diff --git a/openmp/libomptarget/CMakeLists.txt b/openmp/libomptarget/CMakeLists.txt index c1bc29faaf45d..541373133a909 100644 --- a/openmp/libomptarget/CMakeLists.txt +++ b/openmp/libomptarget/CMakeLists.txt @@ -1,85 +1,85 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build offloading library and related plugins. -# -##===----------------------------------------------------------------------===## - -if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") - message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") -endif() - -# Add cmake directory to search for custom cmake functions. -set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) - -if(OPENMP_STANDALONE_BUILD) - # Build all libraries into a common place so that tests can find them. - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) -endif() - -# Message utilities. -include(LibomptargetUtils) - -# Get dependencies for the different components of the project. -include(LibomptargetGetDependencies) - -# This is a list of all the targets that are supported/tested right now. -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") -set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") - -# Once the plugins for the different targets are validated, they will be added to -# the list of supported targets in the current system. -set (LIBOMPTARGET_SYSTEM_TARGETS "") -set (LIBOMPTARGET_TESTED_PLUGINS "") - -# Check whether using debug mode. In debug mode, allow dumping progress -# messages at runtime by default. Otherwise, it can be enabled -# independently using the LIBOMPTARGET_ENABLE_DEBUG option. -string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) -if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) - option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ON) -else() - option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" OFF) -endif() -if(LIBOMPTARGET_ENABLE_DEBUG) - add_definitions(-DOMPTARGET_DEBUG) -endif() - -include_directories(include) - -# Build target agnostic offloading library. -add_subdirectory(src) - -# Retrieve the path to the resulting library so that it can be used for -# testing. -get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) -if(NOT LIBOMPTARGET_LIBRARY_DIR) - set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -endif() - -# Definitions for testing, for reuse when testing libomptarget-nvptx. 
-if(OPENMP_STANDALONE_BUILD) - set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING - "Path to folder containing omp.h") - set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING - "Path to folder containing libomp.so") -else() - set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src") -endif() - - -# Build offloading plugins and device RTLs if they are available. -add_subdirectory(plugins) -add_subdirectory(deviceRTLs) - -# Add tests. -add_subdirectory(test) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build offloading library and related plugins. +# +##===----------------------------------------------------------------------===## + +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") + message(FATAL_ERROR "Direct configuration not supported, please use parent directory!") +endif() + +# Add cmake directory to search for custom cmake functions. +set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules ${CMAKE_MODULE_PATH}) + +if(OPENMP_STANDALONE_BUILD) + # Build all libraries into a common place so that tests can find them. + set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Message utilities. +include(LibomptargetUtils) + +# Get dependencies for the different components of the project. +include(LibomptargetGetDependencies) + +# This is a list of all the targets that are supported/tested right now. +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} aarch64-unknown-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64le-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-pc-linux-gnu") +set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda") + +# Once the plugins for the different targets are validated, they will be added to +# the list of supported targets in the current system. +set (LIBOMPTARGET_SYSTEM_TARGETS "") +set (LIBOMPTARGET_TESTED_PLUGINS "") + +# Check whether using debug mode. In debug mode, allow dumping progress +# messages at runtime by default. Otherwise, it can be enabled +# independently using the LIBOMPTARGET_ENABLE_DEBUG option. +string( TOLOWER "${CMAKE_BUILD_TYPE}" LIBOMPTARGET_CMAKE_BUILD_TYPE) +if(LIBOMPTARGET_CMAKE_BUILD_TYPE MATCHES debug) + option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" ON) +else() + option(LIBOMPTARGET_ENABLE_DEBUG "Allow debug output with the environment variable LIBOMPTARGET_DEBUG=1" OFF) +endif() +if(LIBOMPTARGET_ENABLE_DEBUG) + add_definitions(-DOMPTARGET_DEBUG) +endif() + +include_directories(include) + +# Build target agnostic offloading library. +add_subdirectory(src) + +# Retrieve the path to the resulting library so that it can be used for +# testing. 
+get_target_property(LIBOMPTARGET_LIBRARY_DIR omptarget LIBRARY_OUTPUT_DIRECTORY) +if(NOT LIBOMPTARGET_LIBRARY_DIR) + set(LIBOMPTARGET_LIBRARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) +endif() + +# Definitions for testing, for reuse when testing libomptarget-nvptx. +if(OPENMP_STANDALONE_BUILD) + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing omp.h") + set(LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src" CACHE STRING + "Path to folder containing libomp.so") +else() + set(LIBOMPTARGET_OPENMP_HEADER_FOLDER "${CMAKE_CURRENT_BINARY_DIR}/../runtime/src") +endif() + + +# Build offloading plugins and device RTLs if they are available. +add_subdirectory(plugins) +add_subdirectory(deviceRTLs) + +# Add tests. +add_subdirectory(test) diff --git a/openmp/libomptarget/README.txt b/openmp/libomptarget/README.txt index 8c0a83729fdbe..18c7a0e970965 100644 --- a/openmp/libomptarget/README.txt +++ b/openmp/libomptarget/README.txt @@ -1,73 +1,73 @@ - - README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) - ====================================================================== - -How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) -======================================================================== -In-tree build: - -$ cd where-you-want-to-live -Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects -$ cd where-you-want-to-build -$ mkdir build && cd build -$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= -$ make omptarget - -Out-of-tree build: - -$ cd where-you-want-to-live -Check out openmp (libomptarget lives under ./libomptarget) -$ cd where-you-want-to-live/openmp/libomptarget -$ mkdir build && cd build -$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= -$ make - -For details about building, please look at README.rst in the parent directory. - -Architectures Supported -======================= -The current library has been only tested in Linux operating system and the -following host architectures: -* Intel(R) 64 architecture -* IBM(R) Power architecture (big endian) -* IBM(R) Power architecture (little endian) -* ARM(R) AArch64 architecture (little endian) - -The currently supported offloading device architectures are: -* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) -* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) -* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) -* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) -* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures - -Supported RTL Build Configurations -================================== -Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 - - --------------------------- - | gcc | clang | ---------------|------------|------------| -| Linux* OS | Yes(1) | Yes(2) | ------------------------------------------ - -(1) gcc version 4.8.2 or later is supported. -(2) clang version 3.7 or later is supported. 
- - -Front-end Compilers that work with this RTL -=========================================== - -The following compilers are known to do compatible code generation for -this RTL: - - clang (from https://github.com/clang-ykt ) - - clang (development branch at http://clang.llvm.org - several features still - under development) - ------------------------------------------------------------------------ - -Notices -======= -This library and related compiler support is still under development, so the -employed interface is likely to change in the future. - -*Other names and brands may be claimed as the property of others. + + README for the LLVM* OpenMP* Offloading Runtime Library (libomptarget) + ====================================================================== + +How to Build the LLVM* OpenMP* Offloading Runtime Library (libomptarget) +======================================================================== +In-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) into llvm/projects +$ cd where-you-want-to-build +$ mkdir build && cd build +$ cmake path/to/llvm -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make omptarget + +Out-of-tree build: + +$ cd where-you-want-to-live +Check out openmp (libomptarget lives under ./libomptarget) +$ cd where-you-want-to-live/openmp/libomptarget +$ mkdir build && cd build +$ cmake path/to/openmp -DCMAKE_C_COMPILER= -DCMAKE_CXX_COMPILER= +$ make + +For details about building, please look at README.rst in the parent directory. + +Architectures Supported +======================= +The current library has been only tested in Linux operating system and the +following host architectures: +* Intel(R) 64 architecture +* IBM(R) Power architecture (big endian) +* IBM(R) Power architecture (little endian) +* ARM(R) AArch64 architecture (little endian) + +The currently supported offloading device architectures are: +* Intel(R) 64 architecture (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (big endian) (generic 64-bit plugin - mostly for testing purposes) +* IBM(R) Power architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* ARM(R) AArch64 architecture (little endian) (generic 64-bit plugin - mostly for testing purposes) +* CUDA(R) enabled 64-bit NVIDIA(R) GPU architectures + +Supported RTL Build Configurations +================================== +Supported Architectures: Intel(R) 64, IBM(R) Power 7 and Power 8 + + --------------------------- + | gcc | clang | +--------------|------------|------------| +| Linux* OS | Yes(1) | Yes(2) | +----------------------------------------- + +(1) gcc version 4.8.2 or later is supported. +(2) clang version 3.7 or later is supported. + + +Front-end Compilers that work with this RTL +=========================================== + +The following compilers are known to do compatible code generation for +this RTL: + - clang (from https://github.com/clang-ykt ) + - clang (development branch at http://clang.llvm.org - several features still + under development) + +----------------------------------------------------------------------- + +Notices +======= +This library and related compiler support is still under development, so the +employed interface is likely to change in the future. + +*Other names and brands may be claimed as the property of others. 
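As a usage sketch only (not part of the patch above): the CMakeLists.txt changes keep the LIBOMPTARGET_ENABLE_DEBUG option and the LIBOMPTARGET_DEBUG=1 environment variable, and the README describes an out-of-tree build. Assuming clang/clang++ are the desired compilers and using a hypothetical application name, enabling runtime debug output in such a build might look like:

$ cd where-you-want-to-live/openmp/libomptarget
$ mkdir build && cd build
$ cmake path/to/openmp -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ \
    -DLIBOMPTARGET_ENABLE_DEBUG=ON
$ make
$ LIBOMPTARGET_DEBUG=1 ./offload_app    (hypothetical app; prints libomptarget progress messages at runtime)

Passing -DLIBOMPTARGET_ENABLE_DEBUG=ON explicitly is only needed for non-debug build types, since the option defaults to ON when CMAKE_BUILD_TYPE is a debug configuration.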
diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake index dbf8c381de139..ae05405e9f2fd 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetGetDependencies.cmake @@ -1,192 +1,192 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. -#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# Try to detect in the system several dependencies required by the different -# components of libomptarget. These are the dependencies we have: -# -# libelf : required by some targets to handle the ELF files at runtime. -# libffi : required to launch target kernels given function and argument -# pointers. -# CUDA : required to control offloading to NVIDIA GPUs. - -include (FindPackageHandleStandardArgs) - -################################################################################ -# Looking for libelf... -################################################################################ - -find_path ( - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR - NAMES - libelf.h - PATHS - /usr/include - /usr/local/include - /opt/local/include - /sw/include - ENV CPATH - PATH_SUFFIXES - libelf) - -find_library ( - LIBOMPTARGET_DEP_LIBELF_LIBRARIES - NAMES - elf - PATHS - /usr/lib - /usr/local/lib - /opt/local/lib - /sw/lib - ENV LIBRARY_PATH - ENV LD_LIBRARY_PATH) - -set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) -find_package_handle_standard_args( - LIBOMPTARGET_DEP_LIBELF - DEFAULT_MSG - LIBOMPTARGET_DEP_LIBELF_LIBRARIES - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) - -mark_as_advanced( - LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS - LIBOMPTARGET_DEP_LIBELF_LIBRARIES) - -################################################################################ -# Looking for libffi... -################################################################################ -find_package(PkgConfig) - -pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) - -find_path ( - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR - NAMES - ffi.h - HINTS - ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} - ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} - PATHS - /usr/include - /usr/local/include - /opt/local/include - /sw/include - ENV CPATH) - -# Don't bother look for the library if the header files were not found. -if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) - find_library ( - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES - NAMES - ffi - HINTS - ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} - ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} - PATHS - /usr/lib - /usr/local/lib - /opt/local/lib - /sw/lib - ENV LIBRARY_PATH - ENV LD_LIBRARY_PATH) -endif() - -set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) -find_package_handle_standard_args( - LIBOMPTARGET_DEP_LIBFFI - DEFAULT_MSG - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) - -mark_as_advanced( - LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS - LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) - -################################################################################ -# Looking for CUDA... 
-################################################################################ -if (CUDA_TOOLKIT_ROOT_DIR) - set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE) -endif() -find_package(CUDA QUIET) - -set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) -set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) - -mark_as_advanced( - LIBOMPTARGET_DEP_CUDA_FOUND - LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS) - -################################################################################ -# Looking for CUDA Driver API... (needed for CUDA plugin) -################################################################################ - -find_library ( - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES - NAMES - cuda - PATHS - /lib64) - -# There is a libcuda.so in lib64/stubs that can be used for linking. -if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) - # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this - # case CUDA_LIBRARIES contains additional linker arguments which breaks - # get_filename_component below. Fortunately, since that change the module - # exports CUDA_cudart_static_LIBRARY which points to a single file in the - # right directory. - set(cuda_library ${CUDA_LIBRARIES}) - if (DEFINED CUDA_cudart_static_LIBRARY) - set(cuda_library ${CUDA_cudart_static_LIBRARY}) - endif() - get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) - find_library ( - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES - NAMES - cuda - HINTS - "${CUDA_LIBDIR}/stubs") -endif() - -find_package_handle_standard_args( - LIBOMPTARGET_DEP_CUDA_DRIVER - DEFAULT_MSG - LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) - -mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) - -################################################################################ -# Looking for CUDA libdevice subdirectory -# -# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work -# out of the box. More info on http://bugs.debian.org/882505 -################################################################################ - -set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice) - -# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was -# already cached for it, or if it already has libdevice. Otherwise, on -# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs -# libdevice. -if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND - NOT EXISTS - "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") - find_program(LSB_RELEASE lsb_release) - if (LSB_RELEASE) - execute_process(COMMAND ${LSB_RELEASE} -is - OUTPUT_VARIABLE LSB_RELEASE_ID - OUTPUT_STRIP_TRAILING_WHITESPACE) - set(candidate_dir /usr/lib/cuda) - if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu") - AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") - set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH - "Toolkit location." FORCE) - endif() - endif() -endif() +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# Try to detect in the system several dependencies required by the different +# components of libomptarget. These are the dependencies we have: +# +# libelf : required by some targets to handle the ELF files at runtime. 
+# libffi : required to launch target kernels given function and argument +# pointers. +# CUDA : required to control offloading to NVIDIA GPUs. + +include (FindPackageHandleStandardArgs) + +################################################################################ +# Looking for libelf... +################################################################################ + +find_path ( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR + NAMES + libelf.h + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH + PATH_SUFFIXES + libelf) + +find_library ( + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + NAMES + elf + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) + +set(LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBELF + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBELF_LIBRARIES + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBELF_LIBRARIES) + +################################################################################ +# Looking for libffi... +################################################################################ +find_package(PkgConfig) + +pkg_check_modules(LIBOMPTARGET_SEARCH_LIBFFI QUIET libffi) + +find_path ( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR + NAMES + ffi.h + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDEDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_INCLUDE_DIRS} + PATHS + /usr/include + /usr/local/include + /opt/local/include + /sw/include + ENV CPATH) + +# Don't bother look for the library if the header files were not found. +if (LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR) + find_library ( + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + NAMES + ffi + HINTS + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBDIR} + ${LIBOMPTARGET_SEARCH_LIBFFI_LIBRARY_DIRS} + PATHS + /usr/lib + /usr/local/lib + /opt/local/lib + /sw/lib + ENV LIBRARY_PATH + ENV LD_LIBRARY_PATH) +endif() + +set(LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS ${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) +find_package_handle_standard_args( + LIBOMPTARGET_DEP_LIBFFI + DEFAULT_MSG + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS) + +mark_as_advanced( + LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIRS + LIBOMPTARGET_DEP_LIBFFI_LIBRARIES) + +################################################################################ +# Looking for CUDA... +################################################################################ +if (CUDA_TOOLKIT_ROOT_DIR) + set(LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET TRUE) +endif() +find_package(CUDA QUIET) + +set(LIBOMPTARGET_DEP_CUDA_FOUND ${CUDA_FOUND}) +set(LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + +mark_as_advanced( + LIBOMPTARGET_DEP_CUDA_FOUND + LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS) + +################################################################################ +# Looking for CUDA Driver API... (needed for CUDA plugin) +################################################################################ + +find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + PATHS + /lib64) + +# There is a libcuda.so in lib64/stubs that can be used for linking. +if (NOT LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES AND CUDA_FOUND) + # Since CMake 3.3 FindCUDA.cmake defaults to using static libraries. In this + # case CUDA_LIBRARIES contains additional linker arguments which breaks + # get_filename_component below. 
Fortunately, since that change the module + # exports CUDA_cudart_static_LIBRARY which points to a single file in the + # right directory. + set(cuda_library ${CUDA_LIBRARIES}) + if (DEFINED CUDA_cudart_static_LIBRARY) + set(cuda_library ${CUDA_cudart_static_LIBRARY}) + endif() + get_filename_component(CUDA_LIBDIR ${cuda_library} DIRECTORY) + find_library ( + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES + NAMES + cuda + HINTS + "${CUDA_LIBDIR}/stubs") +endif() + +find_package_handle_standard_args( + LIBOMPTARGET_DEP_CUDA_DRIVER + DEFAULT_MSG + LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +mark_as_advanced(LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES) + +################################################################################ +# Looking for CUDA libdevice subdirectory +# +# Special case for Debian/Ubuntu to have nvidia-cuda-toolkit work +# out of the box. More info on http://bugs.debian.org/882505 +################################################################################ + +set(LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR nvvm/libdevice) + +# Don't alter CUDA_TOOLKIT_ROOT_DIR if the user specified it, if a value was +# already cached for it, or if it already has libdevice. Otherwise, on +# Debian/Ubuntu, look where the nvidia-cuda-toolkit package normally installs +# libdevice. +if (NOT LIBOMPTARGET_CUDA_TOOLKIT_ROOT_DIR_PRESET AND + NOT EXISTS + "${CUDA_TOOLKIT_ROOT_DIR}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + find_program(LSB_RELEASE lsb_release) + if (LSB_RELEASE) + execute_process(COMMAND ${LSB_RELEASE} -is + OUTPUT_VARIABLE LSB_RELEASE_ID + OUTPUT_STRIP_TRAILING_WHITESPACE) + set(candidate_dir /usr/lib/cuda) + if ((LSB_RELEASE_ID STREQUAL "Debian" OR LSB_RELEASE_ID STREQUAL "Ubuntu") + AND EXISTS "${candidate_dir}/${LIBOMPTARGET_CUDA_LIBDEVICE_SUBDIR}") + set(CUDA_TOOLKIT_ROOT_DIR "${candidate_dir}" CACHE PATH + "Toolkit location." FORCE) + endif() + endif() +endif() diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake index 6ec0cc2b61bc0..6128618bf487d 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetNVPTXBitcodeLibrary.cmake @@ -1,111 +1,111 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. -#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# We use the compiler and linker provided by the user, attempt to use the one -# used to build libomptarget or just fail. -set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) - -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) -elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) -else() - return() -endif() - -# Get compiler directory to try to locate a suitable linker. -get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) -set(llvm_link "${compiler_dir}/llvm-link") - -if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") - set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) -elseif (EXISTS "${llvm_link}") - # Use llvm-link from the compiler directory. 
- set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") -else() - return() -endif() - -function(try_compile_bitcode output source) - set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) - file(WRITE ${srcfile} "${source}\n") - set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) - - # The remaining arguments are the flags to be tested. - # FIXME: Don't hardcode GPU version. This is currently required because - # Clang refuses to compile its default of sm_20 with CUDA 9. - execute_process( - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} - --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} - RESULT_VARIABLE result - OUTPUT_QUIET ERROR_QUIET) - if (result EQUAL 0) - set(${output} TRUE PARENT_SCOPE) - else() - set(${output} FALSE PARENT_SCOPE) - endif() -endfunction() - -# Save for which compiler we are going to do the following checks so that we -# can discard cached values if the user specifies a different value. -set(discard_cached FALSE) -if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND - NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) - set(discard_cached TRUE) -endif() -set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) - -function(check_bitcode_compilation output source) - if (${discard_cached} OR NOT DEFINED ${output}) - message(STATUS "Performing Test ${output}") - # Forward additional arguments which contain the flags. - try_compile_bitcode(result "${source}" ${ARGN}) - set(${output} ${result} CACHE INTERNAL "" FORCE) - if(${result}) - message(STATUS "Performing Test ${output} - Success") - else() - message(STATUS "Performing Test ${output} - Failed") - endif() - endif() -endfunction() - -# These flags are required to emit LLVM Bitcode. We check them together because -# if any of them are not supported, there is no point in finding out which are. -set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) -set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") -check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) - -# It makes no sense to continue given that the compiler doesn't support -# emitting basic LLVM Bitcode -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) - return() -endif() - -set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) - -# Declaring external shared device variables might need an additional flag -# since Clang 7.0 and was entirely unsupported since version 4.0. 
-set(extern_device_shared_src "extern __device__ __shared__ int test;") - -check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) -if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) - set(compiler_flag_fcuda_rdc -fcuda-rdc) - set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) - check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) - - if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) - return() - endif() - - set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") -endif() - -# We can compile LLVM Bitcode from CUDA source code! -set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# We use the compiler and linker provided by the user, attempt to use the one +# used to build libomptarget or just fail. +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED FALSE) + +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${LIBOMPTARGET_NVPTX_CUDA_COMPILER}) +elseif(${CMAKE_C_COMPILER_ID} STREQUAL "Clang") + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER ${CMAKE_C_COMPILER}) +else() + return() +endif() + +# Get compiler directory to try to locate a suitable linker. +get_filename_component(compiler_dir ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} DIRECTORY) +set(llvm_link "${compiler_dir}/llvm-link") + +if (NOT LIBOMPTARGET_NVPTX_BC_LINKER STREQUAL "") + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER ${LIBOMPTARGET_NVPTX_BC_LINKER}) +elseif (EXISTS "${llvm_link}") + # Use llvm-link from the compiler directory. + set(LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER "${llvm_link}") +else() + return() +endif() + +function(try_compile_bitcode output source) + set(srcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/src.cu) + file(WRITE ${srcfile} "${source}\n") + set(bcfile ${CMAKE_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/out.bc) + + # The remaining arguments are the flags to be tested. + # FIXME: Don't hardcode GPU version. This is currently required because + # Clang refuses to compile its default of sm_20 with CUDA 9. + execute_process( + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${ARGN} + --cuda-gpu-arch=sm_35 -c ${srcfile} -o ${bcfile} + RESULT_VARIABLE result + OUTPUT_QUIET ERROR_QUIET) + if (result EQUAL 0) + set(${output} TRUE PARENT_SCOPE) + else() + set(${output} FALSE PARENT_SCOPE) + endif() +endfunction() + +# Save for which compiler we are going to do the following checks so that we +# can discard cached values if the user specifies a different value. 
+set(discard_cached FALSE) +if (DEFINED LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER AND + NOT("${LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER}" STREQUAL "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}")) + set(discard_cached TRUE) +endif() +set(LIBOMPTARGET_NVPTX_CHECKED_CUDA_COMPILER "${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER}" CACHE INTERNAL "" FORCE) + +function(check_bitcode_compilation output source) + if (${discard_cached} OR NOT DEFINED ${output}) + message(STATUS "Performing Test ${output}") + # Forward additional arguments which contain the flags. + try_compile_bitcode(result "${source}" ${ARGN}) + set(${output} ${result} CACHE INTERNAL "" FORCE) + if(${result}) + message(STATUS "Performing Test ${output} - Success") + else() + message(STATUS "Performing Test ${output} - Failed") + endif() + endif() +endfunction() + +# These flags are required to emit LLVM Bitcode. We check them together because +# if any of them are not supported, there is no point in finding out which are. +set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++14 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) +set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }") +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required}) + +# It makes no sense to continue given that the compiler doesn't support +# emitting basic LLVM Bitcode +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED) + return() +endif() + +set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS ${compiler_flags_required}) + +# Declaring external shared device variables might need an additional flag +# since Clang 7.0 and was entirely unsupported since version 4.0. +set(extern_device_shared_src "extern __device__ __shared__ int test;") + +check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED "${extern_device_shared_src}" ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS}) +if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_EXTERN_SHARED) + set(compiler_flag_fcuda_rdc -fcuda-rdc) + set(compiler_flag_fcuda_rdc_full ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} ${compiler_flag_fcuda_rdc}) + check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC "${extern_device_shared_src}" ${compiler_flag_fcuda_rdc_full}) + + if (NOT LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FCUDA_RDC) + return() + endif() + + set(LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS "${compiler_flag_fcuda_rdc_full}") +endif() + +# We can compile LLVM Bitcode from CUDA source code! +set(LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED TRUE) diff --git a/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake b/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake index 7339cc0b56edd..1f686067a838c 100644 --- a/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake +++ b/openmp/libomptarget/cmake/Modules/LibomptargetUtils.cmake @@ -1,27 +1,27 @@ -# -#//===----------------------------------------------------------------------===// -#// -#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -#// See https://llvm.org/LICENSE.txt for license information. 
-#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -#// -#//===----------------------------------------------------------------------===// -# - -# void libomptarget_say(string message_to_user); -# - prints out message_to_user -macro(libomptarget_say message_to_user) - message(STATUS "LIBOMPTARGET: ${message_to_user}") -endmacro() - -# void libomptarget_warning_say(string message_to_user); -# - prints out message_to_user with a warning -macro(libomptarget_warning_say message_to_user) - message(WARNING "LIBOMPTARGET: ${message_to_user}") -endmacro() - -# void libomptarget_error_say(string message_to_user); -# - prints out message_to_user with an error and exits cmake -macro(libomptarget_error_say message_to_user) - message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") -endmacro() +# +#//===----------------------------------------------------------------------===// +#// +#// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +#// See https://llvm.org/LICENSE.txt for license information. +#// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#// +#//===----------------------------------------------------------------------===// +# + +# void libomptarget_say(string message_to_user); +# - prints out message_to_user +macro(libomptarget_say message_to_user) + message(STATUS "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_warning_say(string message_to_user); +# - prints out message_to_user with a warning +macro(libomptarget_warning_say message_to_user) + message(WARNING "LIBOMPTARGET: ${message_to_user}") +endmacro() + +# void libomptarget_error_say(string message_to_user); +# - prints out message_to_user with an error and exits cmake +macro(libomptarget_error_say message_to_user) + message(FATAL_ERROR "LIBOMPTARGET: ${message_to_user}") +endmacro() diff --git a/openmp/libomptarget/deviceRTLs/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/CMakeLists.txt index 3df94eac0727e..9761b8b262ee9 100644 --- a/openmp/libomptarget/deviceRTLs/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/CMakeLists.txt @@ -1,13 +1,13 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ##===----------------------------------------------------------------------===## -# -# Build a device RTL for each available machine. -# -##===----------------------------------------------------------------------===## - -add_subdirectory(nvptx) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ##===----------------------------------------------------------------------===## +# +# Build a device RTL for each available machine. 
+# +##===----------------------------------------------------------------------===## + +add_subdirectory(nvptx) diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt index 1a24bfd6f8876..47d8380ac1157 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/amdgcn/CMakeLists.txt @@ -1,153 +1,153 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the AMDGCN Device RTL if the ROCM tools are available -# -##===----------------------------------------------------------------------===## - -find_package(LLVM QUIET CONFIG - PATHS - $ENV{AOMP} - $ENV{HOME}/rocm/aomp - /opt/rocm/aomp - /usr/lib/rocm/aomp - ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} - ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} - ${CMAKE_CXX_COMPILER_DIR} - NO_DEFAULT_PATH) - -if (LLVM_DIR) - libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") -else() - libomptarget_say("Not building AMDGCN device RTL: AOMP not found") - return() -endif() - -set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) - -if (AOMP_INSTALL_PREFIX) - set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) -else() - set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) -endif() - -libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") - -project(omptarget-amdgcn) - -add_custom_target(omptarget-amdgcn ALL) - -#optimization level -set(optimization_level 2) - -# Activate RTL message dumps if requested by the user. 
-if(LIBOMPTARGET_NVPTX_DEBUG) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) - -set(cuda_sources - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip - ${devicertl_base_directory}/common/src/cancel.cu - ${devicertl_base_directory}/common/src/critical.cu - ${devicertl_base_directory}/common/src/data_sharing.cu - ${devicertl_base_directory}/common/src/libcall.cu - ${devicertl_base_directory}/common/src/loop.cu - ${devicertl_base_directory}/common/src/omp_data.cu - ${devicertl_base_directory}/common/src/omptarget.cu - ${devicertl_base_directory}/common/src/parallel.cu - ${devicertl_base_directory}/common/src/reduction.cu - ${devicertl_base_directory}/common/src/support.cu - ${devicertl_base_directory}/common/src/sync.cu - ${devicertl_base_directory}/common/src/task.cu) - -set(h_files - ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h - ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h - ${devicertl_base_directory}/common/debug.h - ${devicertl_base_directory}/common/device_environment.h - ${devicertl_base_directory}/common/omptarget.h - ${devicertl_base_directory}/common/omptargeti.h - ${devicertl_base_directory}/common/state-queue.h - ${devicertl_base_directory}/common/target_atomic.h - ${devicertl_base_directory}/common/state-queuei.h - ${devicertl_base_directory}/common/support.h) - -# for both in-tree and out-of-tree build -if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) - set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) -else() - set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) -endif() - -# create libraries -set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) -if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) - set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) -endif() - -macro(add_cuda_bc_library) - set(cu_cmd ${AOMP_BINDIR}/clang++ - -std=c++14 - -fcuda-rdc - -fvisibility=default - --cuda-device-only - -Wno-unused-value - -x hip - -O${optimization_level} - --cuda-gpu-arch=${mcpu} - ${CUDA_DEBUG} - -I${CMAKE_CURRENT_SOURCE_DIR}/src - -I${devicertl_base_directory}) - - set(bc1_files) - - foreach(file ${ARGN}) - get_filename_component(fname ${file} NAME_WE) - set(bc1_filename ${fname}.${mcpu}.bc) - - add_custom_command( - OUTPUT ${bc1_filename} - COMMAND ${cu_cmd} ${file} -o ${bc1_filename} - DEPENDS ${file} ${h_files}) - - list(APPEND bc1_files ${bc1_filename}) - endforeach() - - add_custom_command( - OUTPUT linkout.cuda.${mcpu}.bc - COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc - DEPENDS ${bc1_files}) - - list(APPEND bc_files linkout.cuda.${mcpu}.bc) -endmacro() - -set(libname "omptarget-amdgcn") - -foreach(mcpu ${mcpus}) - set(bc_files) - add_cuda_bc_library(${cuda_sources}) - - set(bc_libname lib${libname}-${mcpu}.bc) - add_custom_command( - OUTPUT ${bc_libname} - COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} - DEPENDS ${bc_files}) - - add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) - - install(FILES ${OUTPUTDIR}/${bc_libname} - DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" - ) -endforeach() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the AMDGCN Device RTL if the ROCM tools are available +# +##===----------------------------------------------------------------------===## + +find_package(LLVM QUIET CONFIG + PATHS + $ENV{AOMP} + $ENV{HOME}/rocm/aomp + /opt/rocm/aomp + /usr/lib/rocm/aomp + ${LIBOMPTARGET_NVPTX_CUDA_COMPILER_DIR} + ${LIBOMPTARGET_NVPTX_CUDA_LINKER_DIR} + ${CMAKE_CXX_COMPILER_DIR} + NO_DEFAULT_PATH) + +if (LLVM_DIR) + libomptarget_say("Found LLVM ${LLVM_PACKAGE_VERSION}. Configure: ${LLVM_DIR}/LLVMConfig.cmake") +else() + libomptarget_say("Not building AMDGCN device RTL: AOMP not found") + return() +endif() + +set(AOMP_INSTALL_PREFIX ${LLVM_INSTALL_PREFIX}) + +if (AOMP_INSTALL_PREFIX) + set(AOMP_BINDIR ${AOMP_INSTALL_PREFIX}/bin) +else() + set(AOMP_BINDIR ${LLVM_BUILD_BINARY_DIR}/bin) +endif() + +libomptarget_say("Building AMDGCN device RTL. LLVM_COMPILER_PATH=${AOMP_BINDIR}") + +project(omptarget-amdgcn) + +add_custom_target(omptarget-amdgcn ALL) + +#optimization level +set(optimization_level 2) + +# Activate RTL message dumps if requested by the user. +if(LIBOMPTARGET_NVPTX_DEBUG) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) + +set(cuda_sources + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_smid.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_locks.hip + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.hip + ${devicertl_base_directory}/common/src/cancel.cu + ${devicertl_base_directory}/common/src/critical.cu + ${devicertl_base_directory}/common/src/data_sharing.cu + ${devicertl_base_directory}/common/src/libcall.cu + ${devicertl_base_directory}/common/src/loop.cu + ${devicertl_base_directory}/common/src/omp_data.cu + ${devicertl_base_directory}/common/src/omptarget.cu + ${devicertl_base_directory}/common/src/parallel.cu + ${devicertl_base_directory}/common/src/reduction.cu + ${devicertl_base_directory}/common/src/support.cu + ${devicertl_base_directory}/common/src/sync.cu + ${devicertl_base_directory}/common/src/task.cu) + +set(h_files + ${CMAKE_CURRENT_SOURCE_DIR}/src/amdgcn_interface.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/hip_atomics.h + ${CMAKE_CURRENT_SOURCE_DIR}/src/target_impl.h + ${devicertl_base_directory}/common/debug.h + ${devicertl_base_directory}/common/device_environment.h + ${devicertl_base_directory}/common/omptarget.h + ${devicertl_base_directory}/common/omptargeti.h + ${devicertl_base_directory}/common/state-queue.h + ${devicertl_base_directory}/common/target_atomic.h + ${devicertl_base_directory}/common/state-queuei.h + ${devicertl_base_directory}/common/support.h) + +# for both in-tree and out-of-tree build +if (NOT CMAKE_ARCHIVE_OUTPUT_DIRECTORY) + set(OUTPUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +else() + set(OUTPUTDIR ${CMAKE_ARCHIVE_OUTPUT_DIRECTORY}) +endif() + +# create libraries +set(mcpus gfx700 gfx701 gfx801 gfx803 gfx900) +if (DEFINED LIBOMPTARGET_AMDGCN_GFXLIST) + set(mcpus ${LIBOMPTARGET_AMDGCN_GFXLIST}) +endif() + +macro(add_cuda_bc_library) + set(cu_cmd ${AOMP_BINDIR}/clang++ + -std=c++14 + -fcuda-rdc + -fvisibility=default + --cuda-device-only + -Wno-unused-value + -x hip + -O${optimization_level} + --cuda-gpu-arch=${mcpu} + ${CUDA_DEBUG} + -I${CMAKE_CURRENT_SOURCE_DIR}/src + -I${devicertl_base_directory}) + + set(bc1_files) + + foreach(file ${ARGN}) + get_filename_component(fname ${file} NAME_WE) + set(bc1_filename ${fname}.${mcpu}.bc) + + 
add_custom_command( + OUTPUT ${bc1_filename} + COMMAND ${cu_cmd} ${file} -o ${bc1_filename} + DEPENDS ${file} ${h_files}) + + list(APPEND bc1_files ${bc1_filename}) + endforeach() + + add_custom_command( + OUTPUT linkout.cuda.${mcpu}.bc + COMMAND ${AOMP_BINDIR}/llvm-link ${bc1_files} -o linkout.cuda.${mcpu}.bc + DEPENDS ${bc1_files}) + + list(APPEND bc_files linkout.cuda.${mcpu}.bc) +endmacro() + +set(libname "omptarget-amdgcn") + +foreach(mcpu ${mcpus}) + set(bc_files) + add_cuda_bc_library(${cuda_sources}) + + set(bc_libname lib${libname}-${mcpu}.bc) + add_custom_command( + OUTPUT ${bc_libname} + COMMAND ${AOMP_BINDIR}/llvm-link ${bc_files} | ${AOMP_BINDIR}/opt --always-inline -o ${OUTPUTDIR}/${bc_libname} + DEPENDS ${bc_files}) + + add_custom_target(lib${libname}-${mcpu} ALL DEPENDS ${bc_libname}) + + install(FILES ${OUTPUTDIR}/${bc_libname} + DESTINATION "${OPENMP_INSTALL_LIBDIR}/libdevice" + ) +endforeach() diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h index f7c75c09362a2..e1042e0367217 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_interface.h @@ -1,18 +1,18 @@ -//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _AMDGCN_INTERFACE_H_ -#define _AMDGCN_INTERFACE_H_ - -#include - -#define EXTERN extern "C" __attribute__((device)) -typedef uint64_t __kmpc_impl_lanemask_t; -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -#endif +//===--- amdgcn_interface.h - OpenMP interface definitions ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _AMDGCN_INTERFACE_H_ +#define _AMDGCN_INTERFACE_H_ + +#include + +#define EXTERN extern "C" __attribute__((device)) +typedef uint64_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip index 4163a14f50bf1..c64200d4289fa 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_locks.hip @@ -1,28 +1,28 @@ -//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock -// cannot be implemented - if one thread gets the lock, it can't continue on to -// the next instruction in order to do anything as the other threads are waiting -// to take the lock. 
-// These functions will be implemented to provide the documented semantics for -// a SIMD => wavefront mapping once that is implemented. -// -//===----------------------------------------------------------------------===// - -#include "common/debug.h" - -static DEVICE void warn() { - PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); -} - -DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } +//===-- amdgcn_locks.hip - AMDGCN OpenMP GPU lock implementation -- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// A 'thread' maps onto a lane of the wavefront. This means a per-thread lock +// cannot be implemented - if one thread gets the lock, it can't continue on to +// the next instruction in order to do anything as the other threads are waiting +// to take the lock. +// These functions will be implemented to provide the documented semantics for +// a SIMD => wavefront mapping once that is implemented. +// +//===----------------------------------------------------------------------===// + +#include "common/debug.h" + +static DEVICE void warn() { + PRINT0(LD_ALL, "Locks are not supported in this thread mapping model"); +} + +DEVICE void __kmpc_impl_init_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_set_lock(omp_lock_t *) { warn(); } +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *) { warn(); } +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock) { warn(); } diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip index 74d0d167137fb..87f02d51cfca7 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/amdgcn_smid.hip @@ -1,61 +1,61 @@ -//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" - -// Partially derived fom hcc_detail/device_functions.h - -// HW_ID Register bit structure -// WAVE_ID 3:0 Wave buffer slot number. 0-9. -// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. -// PIPE_ID 7:6 Pipeline from which the wave was dispatched. -// CU_ID 11:8 Compute Unit the wave is assigned to. -// SH_ID 12 Shader Array (within an SE) the wave is assigned to. -// SE_ID 14:13 Shader Engine the wave is assigned to. -// TG_ID 19:16 Thread-group ID -// VM_ID 23:20 Virtual Memory ID -// QUEUE_ID 26:24 Queue from which this wave was dispatched. -// STATE_ID 29:27 State ID (graphics only, not compute). -// ME_ID 31:30 Micro-engine ID. 
- -enum { - HW_ID = 4, // specify that the hardware register to read is HW_ID - - HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits - HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register - - HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits - HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register -}; - -// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit -// immediate and returns a 32 bit value. -// The encoding of the immediate parameter is: -// ID 5:0 Which register to read from -// OFFSET 10:6 Range: 0..31 -// WIDTH 15:11 Range: 1..32 - -// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) -// where hwreg forms a 16 bit immediate encoded by the assembler thus: -// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { -// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); -// } -#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) - -// Note: The results can be changed by a context switch -// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper -// bound on how many compute units are available. Some values in this -// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. - -DEVICE uint32_t __kmpc_impl_smid() { - uint32_t cu_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); - uint32_t se_id = __builtin_amdgcn_s_getreg( - ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); - return (se_id << HW_ID_CU_ID_SIZE) + cu_id; -} +//===-------- amdgcn_smid.hip - AMDGCN smid implementation -------- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Partially derived fom hcc_detail/device_functions.h + +// HW_ID Register bit structure +// WAVE_ID 3:0 Wave buffer slot number. 0-9. +// SIMD_ID 5:4 SIMD which the wave is assigned to within the CU. +// PIPE_ID 7:6 Pipeline from which the wave was dispatched. +// CU_ID 11:8 Compute Unit the wave is assigned to. +// SH_ID 12 Shader Array (within an SE) the wave is assigned to. +// SE_ID 14:13 Shader Engine the wave is assigned to. +// TG_ID 19:16 Thread-group ID +// VM_ID 23:20 Virtual Memory ID +// QUEUE_ID 26:24 Queue from which this wave was dispatched. +// STATE_ID 29:27 State ID (graphics only, not compute). +// ME_ID 31:30 Micro-engine ID. + +enum { + HW_ID = 4, // specify that the hardware register to read is HW_ID + + HW_ID_CU_ID_SIZE = 4, // size of CU_ID field in bits + HW_ID_CU_ID_OFFSET = 8, // offset of CU_ID from start of register + + HW_ID_SE_ID_SIZE = 2, // sizeof SE_ID field in bits + HW_ID_SE_ID_OFFSET = 13, // offset of SE_ID from start of register +}; + +// The s_getreg_b32 instruction, exposed as an intrinsic, takes a 16 bit +// immediate and returns a 32 bit value. 
+// The encoding of the immediate parameter is: +// ID 5:0 Which register to read from +// OFFSET 10:6 Range: 0..31 +// WIDTH 15:11 Range: 1..32 + +// The asm equivalent is s_getreg_b32 %0, hwreg(HW_REG_HW_ID, Offset, Width) +// where hwreg forms a 16 bit immediate encoded by the assembler thus: +// uint64_t encodeHwreg(uint64_t Id, uint64_t Offset, uint64_t Width) { +// return (Id << 0_) | (Offset << 6) | ((Width - 1) << 11); +// } +#define ENCODE_HWREG(WIDTH, OFF, REG) (REG | (OFF << 6) | ((WIDTH - 1) << 11)) + +// Note: The results can be changed by a context switch +// Return value in [0 2^SE_ID_SIZE * 2^CU_ID_SIZE), which is an upper +// bound on how many compute units are available. Some values in this +// range may never be returned if there are fewer than 2^CU_ID_SIZE CUs. + +DEVICE uint32_t __kmpc_impl_smid() { + uint32_t cu_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_CU_ID_SIZE, HW_ID_CU_ID_OFFSET, HW_ID)); + uint32_t se_id = __builtin_amdgcn_s_getreg( + ENCODE_HWREG(HW_ID_SE_ID_SIZE, HW_ID_SE_ID_OFFSET, HW_ID)); + return (se_id << HW_ID_CU_ID_SIZE) + cu_id; +} diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h index df102c765925c..312003d902d0d 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/hip_atomics.h @@ -1,42 +1,42 @@ -//===---- hip_atomics.h - Declarations of hip atomic functions ---- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H -#define OMPTARGET_AMDGCN_HIP_ATOMICS_H - -#include "target_impl.h" - -// inc requires an amdgcn specific intrinsic which is not yet available -DEVICE unsigned atomicInc(unsigned *address); -DEVICE unsigned atomicInc(unsigned *address, unsigned max); -DEVICE int atomicInc(int *address); - -namespace { - -template DEVICE T atomicAdd(T *address, T val) { - return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicMax(T *address, T val) { - return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); -} - -template DEVICE T atomicExch(T *address, T val) { - T r; - __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); - return r; -} - -template DEVICE T atomicCAS(T *address, T compare, T val) { - (void)__atomic_compare_exchange(address, &compare, &val, false, - __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); - return compare; -} - -} // namespace -#endif +//===---- hip_atomics.h - Declarations of hip atomic functions ---- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_AMDGCN_HIP_ATOMICS_H +#define OMPTARGET_AMDGCN_HIP_ATOMICS_H + +#include "target_impl.h" + +// inc requires an amdgcn specific intrinsic which is not yet available +DEVICE unsigned atomicInc(unsigned *address); +DEVICE unsigned atomicInc(unsigned *address, unsigned max); +DEVICE int atomicInc(int *address); + +namespace { + +template DEVICE T atomicAdd(T *address, T val) { + return __atomic_fetch_add(address, val, __ATOMIC_SEQ_CST); +} + +template DEVICE T atomicMax(T *address, T val) { + return __atomic_fetch_max(address, val, __ATOMIC_SEQ_CST); +} + +template DEVICE T atomicExch(T *address, T val) { + T r; + __atomic_exchange(address, &val, &r, __ATOMIC_SEQ_CST); + return r; +} + +template DEVICE T atomicCAS(T *address, T compare, T val) { + (void)__atomic_compare_exchange(address, &compare, &val, false, + __ATOMIC_SEQ_CST, __ATOMIC_RELAXED); + return compare; +} + +} // namespace +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h index 04755a6a3e73f..94a12a248a2fc 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.h @@ -1,155 +1,155 @@ -//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations and definitions of target specific functions and constants -// -//===----------------------------------------------------------------------===// -#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H -#define OMPTARGET_AMDGCN_TARGET_IMPL_H - -#ifndef __AMDGCN__ -#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" -#endif - -#include "amdgcn_interface.h" - -#include -#include -#include -#include - -#define DEVICE __attribute__((device)) -#define INLINE inline DEVICE -#define NOINLINE __attribute__((noinline)) DEVICE -#define SHARED __attribute__((shared)) -#define ALIGN(N) __attribute__((aligned(N))) - -#include "hip_atomics.h" - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 64 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd -// function. Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#define OMP_STATE_COUNT 32 -#define MAX_SM 64 - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. -enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. 
- DS_Max_Worker_Threads = 960, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 16, -}; - -INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); - hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); -} - -INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - return (((uint64_t)hi) << 32) | (uint64_t)lo; -} - -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = - UINT64_C(0xffffffffffffffff); - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); - -DEVICE uint32_t __kmpc_impl_smid(); - -DEVICE double __kmpc_impl_get_wtick(); - -DEVICE double __kmpc_impl_get_wtime(); - -INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } - -INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } - -template INLINE T __kmpc_impl_min(T x, T y) { - return x < y ? x : y; -} - -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); - -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, - int32_t SrcLane); - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, - uint32_t Delta, int32_t Width); - -INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } - -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { - // AMDGCN doesn't need to sync threads in a warp -} - -INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { - // we have protected the master warp from releasing from its barrier - // due to a full workgroup barrier in the middle of a work function. - // So it is ok to issue a full workgroup barrier here. - __builtin_amdgcn_s_barrier(); -} - -DEVICE void __kmpc_impl_threadfence(void); -DEVICE void __kmpc_impl_threadfence_block(void); -DEVICE void __kmpc_impl_threadfence_system(void); - -// Calls to the AMDGCN layer (assuming 1D layout) -INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } -INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } -DEVICE int GetNumberOfBlocksInKernel(); -DEVICE int GetNumberOfThreadsInBlock(); -DEVICE unsigned GetWarpId(); -DEVICE unsigned GetLaneId(); - -DEVICE bool __kmpc_impl_is_first_active_thread(); - -// Locks -DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); -DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); -DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); - -// Memory -DEVICE void *__kmpc_impl_malloc(size_t x); -DEVICE void __kmpc_impl_free(void *x); - -// DEVICE versions of part of libc -INLINE void __assert_fail(const char *, const char *, unsigned int, - const char *) { - __builtin_trap(); -} -EXTERN int printf(const char *, ...); - -#endif +//===------- target_impl.h - AMDGCN OpenMP GPU implementation ----- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations and definitions of target specific functions and constants +// +//===----------------------------------------------------------------------===// +#ifndef OMPTARGET_AMDGCN_TARGET_IMPL_H +#define OMPTARGET_AMDGCN_TARGET_IMPL_H + +#ifndef __AMDGCN__ +#error "amdgcn target_impl.h expects to be compiled under __AMDGCN__" +#endif + +#include "amdgcn_interface.h" + +#include +#include +#include +#include + +#define DEVICE __attribute__((device)) +#define INLINE inline DEVICE +#define NOINLINE __attribute__((noinline)) DEVICE +#define SHARED __attribute__((shared)) +#define ALIGN(N) __attribute__((aligned(N))) + +#include "hip_atomics.h" + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 64 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd +// function. Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. +#define OMP_STATE_COUNT 32 +#define MAX_SM 64 + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 960, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 16, +}; + +INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + lo = (uint32_t)(val & UINT64_C(0x00000000FFFFFFFF)); + hi = (uint32_t)((val & UINT64_C(0xFFFFFFFF00000000)) >> 32); +} + +INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + return (((uint64_t)hi) << 32) | (uint64_t)lo; +} + +static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = + UINT64_C(0xffffffffffffffff); + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt(); + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt(); + +DEVICE uint32_t __kmpc_impl_smid(); + +DEVICE double __kmpc_impl_get_wtick(); + +DEVICE double __kmpc_impl_get_wtime(); + +INLINE uint64_t __kmpc_impl_ffs(uint64_t x) { return __builtin_ffsl(x); } + +INLINE uint64_t __kmpc_impl_popc(uint64_t x) { return __builtin_popcountl(x); } + +template INLINE T __kmpc_impl_min(T x, T y) { + return x < y ? 
x : y; +} + +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask(); + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t Var, + int32_t SrcLane); + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t Var, + uint32_t Delta, int32_t Width); + +INLINE void __kmpc_impl_syncthreads() { __builtin_amdgcn_s_barrier(); } + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t) { + // AMDGCN doesn't need to sync threads in a warp +} + +INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { + // we have protected the master warp from releasing from its barrier + // due to a full workgroup barrier in the middle of a work function. + // So it is ok to issue a full workgroup barrier here. + __builtin_amdgcn_s_barrier(); +} + +DEVICE void __kmpc_impl_threadfence(void); +DEVICE void __kmpc_impl_threadfence_block(void); +DEVICE void __kmpc_impl_threadfence_system(void); + +// Calls to the AMDGCN layer (assuming 1D layout) +INLINE int GetThreadIdInBlock() { return __builtin_amdgcn_workitem_id_x(); } +INLINE int GetBlockIdInKernel() { return __builtin_amdgcn_workgroup_id_x(); } +DEVICE int GetNumberOfBlocksInKernel(); +DEVICE int GetNumberOfThreadsInBlock(); +DEVICE unsigned GetWarpId(); +DEVICE unsigned GetLaneId(); + +DEVICE bool __kmpc_impl_is_first_active_thread(); + +// Locks +DEVICE void __kmpc_impl_init_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_destroy_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_set_lock(omp_lock_t *lock); +DEVICE void __kmpc_impl_unset_lock(omp_lock_t *lock); +DEVICE int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +DEVICE void *__kmpc_impl_malloc(size_t x); +DEVICE void __kmpc_impl_free(void *x); + +// DEVICE versions of part of libc +INLINE void __assert_fail(const char *, const char *, unsigned int, + const char *) { + __builtin_trap(); +} +EXTERN int printf(const char *, ...); + +#endif diff --git a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip index 9807483d4c420..aca9daad12143 100644 --- a/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip +++ b/openmp/libomptarget/deviceRTLs/amdgcn/src/target_impl.hip @@ -1,72 +1,72 @@ -//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" - -// Implementations initially derived from hcc - -// Initialized with a 64-bit mask with bits set in positions less than the -// thread's lane number in the warp -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - uint32_t lane = GetLaneId(); - int64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; - return mask & ballot; -} - -// Initialized with a 64-bit mask with bits set in positions greater than the -// thread's lane number in the warp -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - uint32_t lane = GetLaneId(); - if (lane == (WARPSIZE - 1)) - return 0; - uint64_t ballot = __kmpc_impl_activemask(); - uint64_t mask = (~((uint64_t)0)) << (lane + 1); - return mask & ballot; -} - -DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } - -EXTERN uint64_t __clock64(); -DEVICE double __kmpc_impl_get_wtime() { - return ((double)1.0 / 745000000.0) * __clock64(); -} - -// Warp vote function -DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { - return __builtin_amdgcn_read_exec(); -} - -DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, - int32_t srcLane) { - int width = WARPSIZE; - int self = GetLaneId(); - int index = srcLane + (self & ~(width - 1)); - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - -DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, - uint32_t laneDelta, int32_t width) { - int self = GetLaneId(); - int index = self + laneDelta; - index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; - return __builtin_amdgcn_ds_bpermute(index << 2, var); -} - -EXTERN uint64_t __ockl_get_local_size(uint32_t); -EXTERN uint64_t __ockl_get_num_groups(uint32_t); -DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } -DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } -DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -DEVICE unsigned GetLaneId() { - return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); -} +//===------- target_impl.hip - AMDGCN OpenMP GPU implementation --- HIP -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" + +// Implementations initially derived from hcc + +// Initialized with a 64-bit mask with bits set in positions less than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + uint32_t lane = GetLaneId(); + int64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = ((uint64_t)1 << lane) - (uint64_t)1; + return mask & ballot; +} + +// Initialized with a 64-bit mask with bits set in positions greater than the +// thread's lane number in the warp +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + uint32_t lane = GetLaneId(); + if (lane == (WARPSIZE - 1)) + return 0; + uint64_t ballot = __kmpc_impl_activemask(); + uint64_t mask = (~((uint64_t)0)) << (lane + 1); + return mask & ballot; +} + +DEVICE double __kmpc_impl_get_wtick() { return ((double)1E-9); } + +EXTERN uint64_t __clock64(); +DEVICE double __kmpc_impl_get_wtime() { + return ((double)1.0 / 745000000.0) * __clock64(); +} + +// Warp vote function +DEVICE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { + return __builtin_amdgcn_read_exec(); +} + +DEVICE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t, int32_t var, + int32_t srcLane) { + int width = WARPSIZE; + int self = GetLaneId(); + int index = srcLane + (self & ~(width - 1)); + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +DEVICE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t, int32_t var, + uint32_t laneDelta, int32_t width) { + int self = GetLaneId(); + int index = self + laneDelta; + index = (int)(laneDelta + (self & (width - 1))) >= width ? self : index; + return __builtin_amdgcn_ds_bpermute(index << 2, var); +} + +EXTERN uint64_t __ockl_get_local_size(uint32_t); +EXTERN uint64_t __ockl_get_num_groups(uint32_t); +DEVICE int GetNumberOfBlocksInKernel() { return __ockl_get_num_groups(0); } +DEVICE int GetNumberOfThreadsInBlock() { return __ockl_get_local_size(0); } +DEVICE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +DEVICE unsigned GetLaneId() { + return __builtin_amdgcn_mbcnt_hi(~0u, __builtin_amdgcn_mbcnt_lo(~0u, 0u)); +} diff --git a/openmp/libomptarget/deviceRTLs/common/debug.h b/openmp/libomptarget/deviceRTLs/common/debug.h index 6539b7ad70cf6..b19f1bf3563d0 100644 --- a/openmp/libomptarget/deviceRTLs/common/debug.h +++ b/openmp/libomptarget/deviceRTLs/common/debug.h @@ -1,287 +1,287 @@ -//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains debug macros to be used in the application. -// -// Usage guide -// -// PRINT0(flag, str) : if debug flag is on, print (no arguments) -// PRINT(flag, str, args) : if debug flag is on, print (arguments) -// DON(flag) : return true if debug flag is on -// -// ASSERT(flag, cond, str, args): if test flag is on, test the condition -// if the condition is false, print str+args -// and assert. 
-// CAUTION: cond may be evaluate twice -// AON(flag) : return true if test flag is on -// -// WARNING(flag, str, args) : if warning flag is on, print the warning -// WON(flag) : return true if warning flag is on -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_NVPTX_DEBUG_H_ -#define _OMPTARGET_NVPTX_DEBUG_H_ - -#include "common/device_environment.h" - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of debugging -//////////////////////////////////////////////////////////////////////////////// - -#define LD_SET_NONE 0ULL /* none */ -#define LD_SET_ALL -1ULL /* all */ - -// pos 1 -#define LD_SET_LOOP 0x1ULL /* basic loop */ -#define LD_SET_LOOPD 0x2ULL /* basic loop */ -#define LD_SET_PAR 0x4ULL /* basic parallel */ -#define LD_SET_PARD 0x8ULL /* basic parallel */ - -// pos 2 -#define LD_SET_SYNC 0x10ULL /* sync info */ -#define LD_SET_SYNCD 0x20ULL /* sync info */ -#define LD_SET_WAIT 0x40ULL /* state when waiting */ -#define LD_SET_TASK 0x80ULL /* print task info (high level) */ - -// pos 3 -#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ -#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ -#define LD_SET_ENV 0x400ULL /* env info */ -#define LD_SET_CANCEL 0x800ULL /* print cancel info */ - -// pos 4 -#define LD_SET_MEM 0x1000ULL /* malloc / free */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags to print selected output. - -// these are some examples of possible definitions that can be used for -// debugging. -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save -// on cuda buffer -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) -//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) - -#ifndef OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE -#elif OMPTARGET_NVPTX_DEBUG -#warning debug is used, not good for measurements -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of asserts -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LT_SET_NONE 0x0 /* unsafe */ -#define LT_SET_SAFETY \ - 0x1 /* check malloc type of stuff, input at creation, cheap */ -#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ -#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#ifndef OMPTARGET_NVPTX_TEST -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) -#else -#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) -#endif -#endif - -//////////////////////////////////////////////////////////////////////////////// -// set desired level of warnings -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// available flags - -#define LW_SET_ALL -1 -#define LW_SET_NONE 0x0 -#define LW_SET_ENV 0x1 -#define LW_SET_INPUT 0x2 -#define LW_SET_FUSSY 0x4 - -//////////////////////////////////////////////////////////////////////////////// -// set the desired flags - -#if OMPTARGET_NVPTX_DEBUG -#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) -#else 
-#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) -#endif - -//////////////////////////////////////////////////////////////////////////////// -// implementation for debug -//////////////////////////////////////////////////////////////////////////////// - -#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING -#include "common/support.h" - -template -NOINLINE static void log(const char *fmt, Arguments... parameters) { - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); -} - -#endif -#if OMPTARGET_NVPTX_TEST - -template -NOINLINE static void check(bool cond, const char *fmt, - Arguments... parameters) { - if (!cond) - printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), - (int)GetWarpId(), (int)GetLaneId(), parameters...); - assert(cond); -} - -NOINLINE static void check(bool cond) { assert(cond); } -#endif - -// set flags that are tested (inclusion properties) - -#define LD_ALL (LD_SET_ALL) - -#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) -#define LD_LOOPD (LD_SET_LOOPD) -#define LD_PAR (LD_SET_PAR | LD_SET_PARD) -#define LD_PARD (LD_SET_PARD) - -// pos 2 -#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) -#define LD_SYNCD (LD_SET_SYNCD) -#define LD_WAIT (LD_SET_WAIT) -#define LD_TASK (LD_SET_TASK) - -// pos 3 -#define LD_IO (LD_SET_IO | LD_SET_IOD) -#define LD_IOD (LD_SET_IOD) -#define LD_ENV (LD_SET_ENV) -#define LD_CANCEL (LD_SET_CANCEL) - -// pos 3 -#define LD_MEM (LD_SET_MEM) - -// implement -#if OMPTARGET_NVPTX_DEBUG - -#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) - -#define PRINT0(_flag, _str) \ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str); \ - } \ - } - -#define PRINT(_flag, _str, _args...) \ - { \ - if (omptarget_device_environment.debug_level && DON(_flag)) { \ - log(": " _str, _args); \ - } \ - } -#else - -#define DON(_flag) (0) -#define PRINT0(flag, str) -#define PRINT(flag, str, _args...) - -#endif - -// for printing without worrying about precision, pointers... -#define P64(_x) ((unsigned long long)(_x)) - -//////////////////////////////////////////////////////////////////////////////// -// early defs for test -//////////////////////////////////////////////////////////////////////////////// - -#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) -#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) -#define LT_FUSSY (LT_SET_FUSSY) - -#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check(_cond); \ - } \ - } - -#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT - -#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) -#define ASSERT0(_flag, _cond, _str) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n"); \ - } \ - } -#define ASSERT(_flag, _cond, _str, _args...) \ - { \ - if (TON(_flag)) { \ - check((_cond), " ASSERT: " _str "\n", \ - _args); \ - } \ - } - -#else - -#define TON(_flag) (0) -#define ASSERT0(_flag, _cond, _str) -#define ASSERT(_flag, _cond, _str, _args...) 
- -#endif - -//////////////////////////////////////////////////////////////////////////////// -// early defs for warning - -#define LW_ALL (LW_SET_ALL) -#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) -#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) -#define LW_FUSSY (LW_SET_FUSSY) - -#if OMPTARGET_NVPTX_WARNING - -#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) -#define WARNING0(_flag, _str) \ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str); \ - } \ - } -#define WARNING(_flag, _str, _args...) \ - { \ - if (WON(_flag)) { \ - log(" WARNING: " _str, _args); \ - } \ - } - -#else - -#define WON(_flag) (0) -#define WARNING0(_flag, _str) -#define WARNING(_flag, _str, _args...) - -#endif - -#endif +//===------------- debug.h - NVPTX OpenMP debug macros ----------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains debug macros to be used in the application. +// +// Usage guide +// +// PRINT0(flag, str) : if debug flag is on, print (no arguments) +// PRINT(flag, str, args) : if debug flag is on, print (arguments) +// DON(flag) : return true if debug flag is on +// +// ASSERT(flag, cond, str, args): if test flag is on, test the condition +// if the condition is false, print str+args +// and assert. +// CAUTION: cond may be evaluate twice +// AON(flag) : return true if test flag is on +// +// WARNING(flag, str, args) : if warning flag is on, print the warning +// WON(flag) : return true if warning flag is on +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_NVPTX_DEBUG_H_ +#define _OMPTARGET_NVPTX_DEBUG_H_ + +#include "common/device_environment.h" + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of debugging +//////////////////////////////////////////////////////////////////////////////// + +#define LD_SET_NONE 0ULL /* none */ +#define LD_SET_ALL -1ULL /* all */ + +// pos 1 +#define LD_SET_LOOP 0x1ULL /* basic loop */ +#define LD_SET_LOOPD 0x2ULL /* basic loop */ +#define LD_SET_PAR 0x4ULL /* basic parallel */ +#define LD_SET_PARD 0x8ULL /* basic parallel */ + +// pos 2 +#define LD_SET_SYNC 0x10ULL /* sync info */ +#define LD_SET_SYNCD 0x20ULL /* sync info */ +#define LD_SET_WAIT 0x40ULL /* state when waiting */ +#define LD_SET_TASK 0x80ULL /* print task info (high level) */ + +// pos 3 +#define LD_SET_IO 0x100ULL /* big region io (excl atomic) */ +#define LD_SET_IOD 0x200ULL /* big region io (excl atomic) */ +#define LD_SET_ENV 0x400ULL /* env info */ +#define LD_SET_CANCEL 0x800ULL /* print cancel info */ + +// pos 4 +#define LD_SET_MEM 0x1000ULL /* malloc / free */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags to print selected output. + +// these are some examples of possible definitions that can be used for +// debugging. 
+//#define OMPTARGET_NVPTX_DEBUG (LD_SET_ALL) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_LOOP) // limit to loop printfs to save +// on cuda buffer +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_IO | LD_SET_ENV) +//#define OMPTARGET_NVPTX_DEBUG (LD_SET_PAR) + +#ifndef OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_DEBUG LD_SET_NONE +#elif OMPTARGET_NVPTX_DEBUG +#warning debug is used, not good for measurements +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of asserts +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LT_SET_NONE 0x0 /* unsafe */ +#define LT_SET_SAFETY \ + 0x1 /* check malloc type of stuff, input at creation, cheap */ +#define LT_SET_INPUT 0x2 /* check also all runtime inputs */ +#define LT_SET_FUSSY 0x4 /* fussy checks, expensive */ + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#ifndef OMPTARGET_NVPTX_TEST +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_TEST (LT_SET_FUSSY) +#else +#define OMPTARGET_NVPTX_TEST (LT_SET_SAFETY) +#endif +#endif + +//////////////////////////////////////////////////////////////////////////////// +// set desired level of warnings +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// available flags + +#define LW_SET_ALL -1 +#define LW_SET_NONE 0x0 +#define LW_SET_ENV 0x1 +#define LW_SET_INPUT 0x2 +#define LW_SET_FUSSY 0x4 + +//////////////////////////////////////////////////////////////////////////////// +// set the desired flags + +#if OMPTARGET_NVPTX_DEBUG +#define OMPTARGET_NVPTX_WARNING (LW_SET_NONE) +#else +#define OMPTARGET_NVPTX_WARNING (LW_SET_FUSSY) +#endif + +//////////////////////////////////////////////////////////////////////////////// +// implementation for debug +//////////////////////////////////////////////////////////////////////////////// + +#if OMPTARGET_NVPTX_DEBUG || OMPTARGET_NVPTX_TEST || OMPTARGET_NVPTX_WARNING +#include "common/support.h" + +template +NOINLINE static void log(const char *fmt, Arguments... parameters) { + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); +} + +#endif +#if OMPTARGET_NVPTX_TEST + +template +NOINLINE static void check(bool cond, const char *fmt, + Arguments... 
parameters) { + if (!cond) + printf(fmt, (int)GetBlockIdInKernel(), (int)GetThreadIdInBlock(), + (int)GetWarpId(), (int)GetLaneId(), parameters...); + assert(cond); +} + +NOINLINE static void check(bool cond) { assert(cond); } +#endif + +// set flags that are tested (inclusion properties) + +#define LD_ALL (LD_SET_ALL) + +#define LD_LOOP (LD_SET_LOOP | LD_SET_LOOPD) +#define LD_LOOPD (LD_SET_LOOPD) +#define LD_PAR (LD_SET_PAR | LD_SET_PARD) +#define LD_PARD (LD_SET_PARD) + +// pos 2 +#define LD_SYNC (LD_SET_SYNC | LD_SET_SYNCD) +#define LD_SYNCD (LD_SET_SYNCD) +#define LD_WAIT (LD_SET_WAIT) +#define LD_TASK (LD_SET_TASK) + +// pos 3 +#define LD_IO (LD_SET_IO | LD_SET_IOD) +#define LD_IOD (LD_SET_IOD) +#define LD_ENV (LD_SET_ENV) +#define LD_CANCEL (LD_SET_CANCEL) + +// pos 3 +#define LD_MEM (LD_SET_MEM) + +// implement +#if OMPTARGET_NVPTX_DEBUG + +#define DON(_flag) ((unsigned)(OMPTARGET_NVPTX_DEBUG) & (_flag)) + +#define PRINT0(_flag, _str) \ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log(": " _str); \ + } \ + } + +#define PRINT(_flag, _str, _args...) \ + { \ + if (omptarget_device_environment.debug_level && DON(_flag)) { \ + log(": " _str, _args); \ + } \ + } +#else + +#define DON(_flag) (0) +#define PRINT0(flag, str) +#define PRINT(flag, str, _args...) + +#endif + +// for printing without worrying about precision, pointers... +#define P64(_x) ((unsigned long long)(_x)) + +//////////////////////////////////////////////////////////////////////////////// +// early defs for test +//////////////////////////////////////////////////////////////////////////////// + +#define LT_SAFETY (LT_SET_SAFETY | LT_SET_INPUT | LT_SET_FUSSY) +#define LT_INPUT (LT_SET_INPUT | LT_SET_FUSSY) +#define LT_FUSSY (LT_SET_FUSSY) + +#if OMPTARGET_NVPTX_TEST == LT_SET_SAFETY + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check(_cond); \ + } \ + } + +#elif OMPTARGET_NVPTX_TEST >= LT_SET_INPUT + +#define TON(_flag) ((OMPTARGET_NVPTX_TEST) & (_flag)) +#define ASSERT0(_flag, _cond, _str) \ + { \ + if (TON(_flag)) { \ + check((_cond), " ASSERT: " _str "\n"); \ + } \ + } +#define ASSERT(_flag, _cond, _str, _args...) \ + { \ + if (TON(_flag)) { \ + check((_cond), " ASSERT: " _str "\n", \ + _args); \ + } \ + } + +#else + +#define TON(_flag) (0) +#define ASSERT0(_flag, _cond, _str) +#define ASSERT(_flag, _cond, _str, _args...) + +#endif + +//////////////////////////////////////////////////////////////////////////////// +// early defs for warning + +#define LW_ALL (LW_SET_ALL) +#define LW_ENV (LW_SET_FUSSY | LW_SET_INPUT | LW_SET_ENV) +#define LW_INPUT (LW_SET_FUSSY | LW_SET_INPUT) +#define LW_FUSSY (LW_SET_FUSSY) + +#if OMPTARGET_NVPTX_WARNING + +#define WON(_flag) ((OMPTARGET_NVPTX_WARNING) & (_flag)) +#define WARNING0(_flag, _str) \ + { \ + if (WON(_flag)) { \ + log(" WARNING: " _str); \ + } \ + } +#define WARNING(_flag, _str, _args...) \ + { \ + if (WON(_flag)) { \ + log(" WARNING: " _str, _args); \ + } \ + } + +#else + +#define WON(_flag) (0) +#define WARNING0(_flag, _str) +#define WARNING(_flag, _str, _args...) 
+ +#endif + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/device_environment.h b/openmp/libomptarget/deviceRTLs/common/device_environment.h index 68a7757d20472..a13454514055d 100644 --- a/openmp/libomptarget/deviceRTLs/common/device_environment.h +++ b/openmp/libomptarget/deviceRTLs/common/device_environment.h @@ -1,24 +1,24 @@ -//===---- device_environment.h - OpenMP GPU device environment --- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Global device environment -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ -#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ - -#include "target_impl.h" - -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - -extern DEVICE omptarget_device_environmentTy omptarget_device_environment; - -#endif +//===---- device_environment.h - OpenMP GPU device environment --- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Global device environment +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICE_ENVIRONMENT_H_ +#define _OMPTARGET_DEVICE_ENVIRONMENT_H_ + +#include "target_impl.h" + +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +extern DEVICE omptarget_device_environmentTy omptarget_device_environment; + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptarget.h b/openmp/libomptarget/deviceRTLs/common/omptarget.h index 986eb3677dcf4..d8e610d34cd0d 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptarget.h +++ b/openmp/libomptarget/deviceRTLs/common/omptarget.h @@ -1,382 +1,382 @@ -//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. -// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_H -#define OMPTARGET_H - -#include "target_impl.h" -#include "common/debug.h" // debug -#include "interface.h" // interfaces with omp, compiler, and user -#include "common/state-queue.h" -#include "common/support.h" - -#define OMPTARGET_NVPTX_VERSION 1.1 - -// used by the library for the interface with the app -#define DISPATCH_FINISHED 0 -#define DISPATCH_NOTFINISHED 1 - -// used by dynamic scheduling -#define FINISHED 0 -#define NOT_FINISHED 1 -#define LAST_CHUNK 2 - -#define BARRIER_COUNTER 0 -#define ORDERED_COUNTER 1 - -// arguments needed for L0 parallelism only. -class omptarget_nvptx_SharedArgs { -public: - // All these methods must be called by the master thread only. 
- INLINE void Init() { - args = buffer; - nArgs = MAX_SHARED_ARGS; - } - INLINE void DeInit() { - // Free any memory allocated for outlined parallel function with a large - // number of arguments. - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - Init(); - } - } - INLINE void EnsureSize(size_t size) { - if (size > nArgs) { - if (nArgs > MAX_SHARED_ARGS) { - SafeFree(args, "new extended args"); - } - args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); - nArgs = size; - } - } - // Called by all threads. - INLINE void **GetArgs() const { return args; }; -private: - // buffer of pre-allocated arguments. - void *buffer[MAX_SHARED_ARGS]; - // pointer to arguments buffer. - // starts off as a pointer to 'buffer' but can be dynamically allocated. - void **args; - // starts off as MAX_SHARED_ARGS but can increase in size. - uint32_t nArgs; -}; - -extern DEVICE SHARED omptarget_nvptx_SharedArgs - omptarget_nvptx_globalArgs; - -// Data structure to keep in shared memory that traces the current slot, stack, -// and frame pointer as well as the active threads that didn't exit the current -// environment. -struct DataSharingStateTy { - __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; - void *StackPtr[DS_Max_Warp_Number]; - void * volatile FramePtr[DS_Max_Warp_Number]; - __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; -}; -// Additional worker slot type which is initialized with the default worker slot -// size of 4*32 bytes. -struct __kmpc_data_sharing_worker_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Worker_Warp_Slot_Size]; -}; -// Additional master slot type which is initialized with the default master slot -// size of 4 bytes. 
-struct __kmpc_data_sharing_master_slot_static { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[DS_Slot_Size]; -}; -extern DEVICE SHARED DataSharingStateTy DataSharingState; - -//////////////////////////////////////////////////////////////////////////////// -// task ICV and (implicit & explicit) task state - -class omptarget_nvptx_TaskDescr { -public: - // methods for flags - INLINE omp_sched_t GetRuntimeSched() const; - INLINE void SetRuntimeSched(omp_sched_t sched); - INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } - INLINE int InL2OrHigherParallelRegion() const { - return items.flags & TaskDescr_InParL2P; - } - INLINE int IsParallelConstruct() const { - return items.flags & TaskDescr_IsParConstr; - } - INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } - // methods for other fields - INLINE uint16_t &ThreadId() { return items.threadId; } - INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } - INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } - INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { - prev = taskDescr; - } - // init & copy - INLINE void InitLevelZeroTaskDescr(); - INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); - INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); - INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); - INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); - INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, - uint16_t tid, uint16_t tnum); - INLINE void SaveLoopData(); - INLINE void RestoreLoopData() const; - -private: - // bits for flags: (6 used, 2 free) - // 3 bits (SchedMask) for runtime schedule - // 1 bit (InPar) if this thread has encountered one or more parallel region - // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) - // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel - // region - static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); - static const uint8_t TaskDescr_InPar = 0x10; - static const uint8_t TaskDescr_IsParConstr = 0x20; - static const uint8_t TaskDescr_InParL2P = 0x40; - - struct SavedLoopDescr_items { - int64_t loopUpperBound; - int64_t nextLowerBound; - int64_t chunk; - int64_t stride; - kmp_sched_t schedule; - } loopData; - - struct TaskDescr_items { - uint8_t flags; // 6 bit used (see flag above) - uint8_t unused; - uint16_t threadId; // thread id - uint64_t runtimeChunkSize; // runtime chunk size - } items; - omptarget_nvptx_TaskDescr *prev; -}; - -// build on kmp -typedef struct omptarget_nvptx_ExplicitTaskDescr { - omptarget_nvptx_TaskDescr - taskDescr; // omptarget_nvptx task description (must be first) - kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) -} omptarget_nvptx_ExplicitTaskDescr; - -//////////////////////////////////////////////////////////////////////////////// -// Descriptor of a parallel region (worksharing in general) - -class omptarget_nvptx_WorkDescr { - -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } - -private: - omptarget_nvptx_TaskDescr masterTaskICV; -}; - 
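The flag constants in omptarget_nvptx_TaskDescr above pack the runtime schedule and three independent state bits into one byte. Below is a small stand-alone C++ sketch of that layout; the constant values are copied from the class, while the scenario in main (schedule id 3 for a first-level parallel construct) is a hypothetical example.

#include <cassert>
#include <cstdint>

static const uint8_t SchedMask   = 0x1 | 0x2 | 0x4; // TaskDescr_SchedMask
static const uint8_t InPar       = 0x10;            // TaskDescr_InPar
static const uint8_t IsParConstr = 0x20;            // TaskDescr_IsParConstr
static const uint8_t InParL2P    = 0x40;            // TaskDescr_InParL2P

int main() {
  uint8_t flags = 0;
  // Keep the runtime schedule id in the low three bits.
  flags = (uint8_t)((flags & ~SchedMask) | (3 & SchedMask));
  // Mark this descriptor as the ICV set of a parallel construct.
  flags |= InPar | IsParConstr;
  assert((flags & SchedMask) == 3);
  assert((flags & InPar) && (flags & IsParConstr));
  assert(!(flags & InParL2P)); // not nested inside an L2+ parallel region
  return 0;
}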
-//////////////////////////////////////////////////////////////////////////////// - -class omptarget_nvptx_TeamDescr { -public: - // access to data - INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() { - return &levelZeroTaskDescr; - } - INLINE omptarget_nvptx_WorkDescr &WorkDescr() { - return workDescrForActiveParallel; - } - INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; } - - // init - INLINE void InitTeamDescr(); - - INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) { - // If this is invoked by the master thread of the master warp then - // initialize it with a smaller slot. - if (IsMasterThread) { - // Do not initialize this slot again if it has already been initalized. - if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the - // data section. DataEnd is non-inclusive. - master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size; - // We currently do not have a next slot. - master_rootS[0].Next = 0; - master_rootS[0].Prev = 0; - master_rootS[0].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&master_rootS[0]; - } - // Do not initialize this slot again if it has already been initalized. - if (worker_rootS[wid].DataEnd == - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size) - return 0; - // Initialize the pointer to the end of the slot given the size of the data - // section. DataEnd is non-inclusive. - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. - worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - - INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) { - worker_rootS[wid].DataEnd = - &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size; - // We currently do not have a next slot. 
- worker_rootS[wid].Next = 0; - worker_rootS[wid].Prev = 0; - worker_rootS[wid].PrevSlotStackPtr = 0; - return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; - } - -private: - omptarget_nvptx_TaskDescr - levelZeroTaskDescr; // icv for team master initial thread - omptarget_nvptx_WorkDescr - workDescrForActiveParallel; // one, ONLY for the active par - uint64_t lastprivateIterBuffer; - - ALIGN(16) - __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; - ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; -}; - -//////////////////////////////////////////////////////////////////////////////// -// thread private data (struct of arrays for better coalescing) -// tid refers here to the global thread id -// do not support multiple concurrent kernel a this time -class omptarget_nvptx_ThreadPrivateContext { -public: - // task - INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { - return &levelOneTaskDescr[tid]; - } - INLINE void SetTopLevelTaskDescr(int tid, - omptarget_nvptx_TaskDescr *taskICV) { - topTaskDescr[tid] = taskICV; - } - INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; - // parallel - INLINE uint16_t &NumThreadsForNextParallel(int tid) { - return nextRegion.tnum[tid]; - } - // simd - INLINE uint16_t &SimdLimitForNextSimd(int tid) { - return nextRegion.slim[tid]; - } - // schedule (for dispatch) - INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } - INLINE int64_t &Chunk(int tid) { return chunk[tid]; } - INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } - INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } - INLINE int64_t &Stride(int tid) { return stride[tid]; } - - INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } - - INLINE void InitThreadPrivateContext(int tid); - INLINE uint64_t &Cnt() { return cnt; } - -private: - // team context for this team - omptarget_nvptx_TeamDescr teamContext; - // task ICV for implicit threads in the only parallel region - omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; - // pointer where to find the current task ICV (top of the stack) - omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; - union { - // Only one of the two is live at the same time. - // parallel - uint16_t tnum[MAX_THREADS_PER_TEAM]; - // simd limit - uint16_t slim[MAX_THREADS_PER_TEAM]; - } nextRegion; - // schedule (for dispatch) - kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for - int64_t chunk[MAX_THREADS_PER_TEAM]; - int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; - // state for dispatch with dyn/guided OR static (never use both at a time) - int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; - int64_t stride[MAX_THREADS_PER_TEAM]; - uint64_t cnt; -}; - -/// Memory manager for statically allocated memory. 
-class omptarget_nvptx_SimpleMemoryManager { -private: - ALIGN(128) struct MemDataTy { - volatile unsigned keys[OMP_STATE_COUNT]; - } MemData[MAX_SM]; - - INLINE static uint32_t hash(unsigned key) { - return key & (OMP_STATE_COUNT - 1); - } - -public: - INLINE void Release(); - INLINE const void *Acquire(const void *buf, size_t size); -}; - -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -extern DEVICE SHARED uint32_t usedMemIdx; -extern DEVICE SHARED uint32_t usedSlotIdx; -extern DEVICE SHARED uint8_t - parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -extern DEVICE SHARED uint16_t threadLimit; -extern DEVICE SHARED uint16_t threadsInTeam; -extern DEVICE SHARED uint16_t nThreads; -extern DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -extern DEVICE SHARED uint32_t execution_param; -extern DEVICE SHARED void *ReductionScratchpadPtr; - -//////////////////////////////////////////////////////////////////////////////// -// work function (outlined parallel/simd functions) and arguments. -// needed for L1 parallelism only. -//////////////////////////////////////////////////////////////////////////////// - -typedef void *omptarget_nvptx_WorkFn; -extern volatile DEVICE SHARED omptarget_nvptx_WorkFn - omptarget_nvptx_workFn; - -//////////////////////////////////////////////////////////////////////////////// -// get private data structures -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode); -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); - -//////////////////////////////////////////////////////////////////////////////// -// inlined implementation -//////////////////////////////////////////////////////////////////////////////// - -#include "common/omptargeti.h" - -#endif +//===---- omptarget.h - OpenMP GPU initialization ---------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_H +#define OMPTARGET_H + +#include "target_impl.h" +#include "common/debug.h" // debug +#include "interface.h" // interfaces with omp, compiler, and user +#include "common/state-queue.h" +#include "common/support.h" + +#define OMPTARGET_NVPTX_VERSION 1.1 + +// used by the library for the interface with the app +#define DISPATCH_FINISHED 0 +#define DISPATCH_NOTFINISHED 1 + +// used by dynamic scheduling +#define FINISHED 0 +#define NOT_FINISHED 1 +#define LAST_CHUNK 2 + +#define BARRIER_COUNTER 0 +#define ORDERED_COUNTER 1 + +// arguments needed for L0 parallelism only. 
+class omptarget_nvptx_SharedArgs { +public: + // All these methods must be called by the master thread only. + INLINE void Init() { + args = buffer; + nArgs = MAX_SHARED_ARGS; + } + INLINE void DeInit() { + // Free any memory allocated for outlined parallel function with a large + // number of arguments. + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, "new extended args"); + Init(); + } + } + INLINE void EnsureSize(size_t size) { + if (size > nArgs) { + if (nArgs > MAX_SHARED_ARGS) { + SafeFree(args, "new extended args"); + } + args = (void **)SafeMalloc(size * sizeof(void *), "new extended args"); + nArgs = size; + } + } + // Called by all threads. + INLINE void **GetArgs() const { return args; }; +private: + // buffer of pre-allocated arguments. + void *buffer[MAX_SHARED_ARGS]; + // pointer to arguments buffer. + // starts off as a pointer to 'buffer' but can be dynamically allocated. + void **args; + // starts off as MAX_SHARED_ARGS but can increase in size. + uint32_t nArgs; +}; + +extern DEVICE SHARED omptarget_nvptx_SharedArgs + omptarget_nvptx_globalArgs; + +// Data structure to keep in shared memory that traces the current slot, stack, +// and frame pointer as well as the active threads that didn't exit the current +// environment. +struct DataSharingStateTy { + __kmpc_data_sharing_slot *SlotPtr[DS_Max_Warp_Number]; + void *StackPtr[DS_Max_Warp_Number]; + void * volatile FramePtr[DS_Max_Warp_Number]; + __kmpc_impl_lanemask_t ActiveThreads[DS_Max_Warp_Number]; +}; +// Additional worker slot type which is initialized with the default worker slot +// size of 4*32 bytes. +struct __kmpc_data_sharing_worker_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Worker_Warp_Slot_Size]; +}; +// Additional master slot type which is initialized with the default master slot +// size of 4 bytes. 
+struct __kmpc_data_sharing_master_slot_static { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[DS_Slot_Size]; +}; +extern DEVICE SHARED DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// task ICV and (implicit & explicit) task state + +class omptarget_nvptx_TaskDescr { +public: + // methods for flags + INLINE omp_sched_t GetRuntimeSched() const; + INLINE void SetRuntimeSched(omp_sched_t sched); + INLINE int InParallelRegion() const { return items.flags & TaskDescr_InPar; } + INLINE int InL2OrHigherParallelRegion() const { + return items.flags & TaskDescr_InParL2P; + } + INLINE int IsParallelConstruct() const { + return items.flags & TaskDescr_IsParConstr; + } + INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); } + // methods for other fields + INLINE uint16_t &ThreadId() { return items.threadId; } + INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; } + INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; } + INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) { + prev = taskDescr; + } + // init & copy + INLINE void InitLevelZeroTaskDescr(); + INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr); + INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr); + INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr); + INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr); + INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr, + uint16_t tid, uint16_t tnum); + INLINE void SaveLoopData(); + INLINE void RestoreLoopData() const; + +private: + // bits for flags: (6 used, 2 free) + // 3 bits (SchedMask) for runtime schedule + // 1 bit (InPar) if this thread has encountered one or more parallel region + // 1 bit (IsParConstr) if ICV for a parallel region (false = explicit task) + // 1 bit (InParL2+) if this thread has encountered L2 or higher parallel + // region + static const uint8_t TaskDescr_SchedMask = (0x1 | 0x2 | 0x4); + static const uint8_t TaskDescr_InPar = 0x10; + static const uint8_t TaskDescr_IsParConstr = 0x20; + static const uint8_t TaskDescr_InParL2P = 0x40; + + struct SavedLoopDescr_items { + int64_t loopUpperBound; + int64_t nextLowerBound; + int64_t chunk; + int64_t stride; + kmp_sched_t schedule; + } loopData; + + struct TaskDescr_items { + uint8_t flags; // 6 bit used (see flag above) + uint8_t unused; + uint16_t threadId; // thread id + uint64_t runtimeChunkSize; // runtime chunk size + } items; + omptarget_nvptx_TaskDescr *prev; +}; + +// build on kmp +typedef struct omptarget_nvptx_ExplicitTaskDescr { + omptarget_nvptx_TaskDescr + taskDescr; // omptarget_nvptx task description (must be first) + kmp_TaskDescr kmpTaskDescr; // kmp task description (must be last) +} omptarget_nvptx_ExplicitTaskDescr; + +//////////////////////////////////////////////////////////////////////////////// +// Descriptor of a parallel region (worksharing in general) + +class omptarget_nvptx_WorkDescr { + +public: + // access to data + INLINE omptarget_nvptx_TaskDescr *WorkTaskDescr() { return &masterTaskICV; } + +private: + omptarget_nvptx_TaskDescr masterTaskICV; +}; + 
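The worker and master slot structs above size their Data arrays with the DS_* constants, so the amount of statically reserved data-sharing space follows directly from the target's warp size. A quick stand-alone arithmetic check, assuming the AMDGCN values quoted earlier in this patch (WARPSIZE = 64, DS_Slot_Size = 256); the local names exist only for this sketch.

#include <cassert>
#include <cstddef>

int main() {
  const size_t WarpSize = 64;                        // WARPSIZE (AMDGCN)
  const size_t SlotSize = 256;                       // DS_Slot_Size
  const size_t WorkerWarpSlot = WarpSize * SlotSize; // DS_Worker_Warp_Slot_Size
  assert(WorkerWarpSlot == 16 * 1024);               // 16 KiB of Data per worker slot
  // The team descriptor below keeps one such slot per warp (worker_rootS[WARPSIZE]),
  // i.e. roughly 1 MiB of preallocated data-sharing storage per team.
  assert(WarpSize * WorkerWarpSlot == 1024 * 1024);
  return 0;
}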
+////////////////////////////////////////////////////////////////////////////////
+
+class omptarget_nvptx_TeamDescr {
+public:
+  // access to data
+  INLINE omptarget_nvptx_TaskDescr *LevelZeroTaskDescr() {
+    return &levelZeroTaskDescr;
+  }
+  INLINE omptarget_nvptx_WorkDescr &WorkDescr() {
+    return workDescrForActiveParallel;
+  }
+  INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
+
+  // init
+  INLINE void InitTeamDescr();
+
+  INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
+    // If this is invoked by the master thread of the master warp then
+    // initialize it with a smaller slot.
+    if (IsMasterThread) {
+      // Do not initialize this slot again if it has already been initialized.
+      if (master_rootS[0].DataEnd == &master_rootS[0].Data[0] + DS_Slot_Size)
+        return 0;
+      // Initialize the pointer to the end of the slot given the size of the
+      // data section. DataEnd is non-inclusive.
+      master_rootS[0].DataEnd = &master_rootS[0].Data[0] + DS_Slot_Size;
+      // We currently do not have a next slot.
+      master_rootS[0].Next = 0;
+      master_rootS[0].Prev = 0;
+      master_rootS[0].PrevSlotStackPtr = 0;
+      return (__kmpc_data_sharing_slot *)&master_rootS[0];
+    }
+    // Do not initialize this slot again if it has already been initialized.
+    if (worker_rootS[wid].DataEnd ==
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size)
+      return 0;
+    // Initialize the pointer to the end of the slot given the size of the data
+    // section. DataEnd is non-inclusive.
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+    worker_rootS[wid].Next = 0;
+    worker_rootS[wid].Prev = 0;
+    worker_rootS[wid].PrevSlotStackPtr = 0;
+    return (__kmpc_data_sharing_slot *)&worker_rootS[wid];
+  }
+
+  INLINE __kmpc_data_sharing_slot *GetPreallocatedSlotAddr(int wid) {
+    worker_rootS[wid].DataEnd =
+        &worker_rootS[wid].Data[0] + DS_Worker_Warp_Slot_Size;
+    // We currently do not have a next slot.
+ worker_rootS[wid].Next = 0; + worker_rootS[wid].Prev = 0; + worker_rootS[wid].PrevSlotStackPtr = 0; + return (__kmpc_data_sharing_slot *)&worker_rootS[wid]; + } + +private: + omptarget_nvptx_TaskDescr + levelZeroTaskDescr; // icv for team master initial thread + omptarget_nvptx_WorkDescr + workDescrForActiveParallel; // one, ONLY for the active par + uint64_t lastprivateIterBuffer; + + ALIGN(16) + __kmpc_data_sharing_worker_slot_static worker_rootS[WARPSIZE]; + ALIGN(16) __kmpc_data_sharing_master_slot_static master_rootS[1]; +}; + +//////////////////////////////////////////////////////////////////////////////// +// thread private data (struct of arrays for better coalescing) +// tid refers here to the global thread id +// do not support multiple concurrent kernel a this time +class omptarget_nvptx_ThreadPrivateContext { +public: + // task + INLINE omptarget_nvptx_TaskDescr *Level1TaskDescr(int tid) { + return &levelOneTaskDescr[tid]; + } + INLINE void SetTopLevelTaskDescr(int tid, + omptarget_nvptx_TaskDescr *taskICV) { + topTaskDescr[tid] = taskICV; + } + INLINE omptarget_nvptx_TaskDescr *GetTopLevelTaskDescr(int tid) const; + // parallel + INLINE uint16_t &NumThreadsForNextParallel(int tid) { + return nextRegion.tnum[tid]; + } + // simd + INLINE uint16_t &SimdLimitForNextSimd(int tid) { + return nextRegion.slim[tid]; + } + // schedule (for dispatch) + INLINE kmp_sched_t &ScheduleType(int tid) { return schedule[tid]; } + INLINE int64_t &Chunk(int tid) { return chunk[tid]; } + INLINE int64_t &LoopUpperBound(int tid) { return loopUpperBound[tid]; } + INLINE int64_t &NextLowerBound(int tid) { return nextLowerBound[tid]; } + INLINE int64_t &Stride(int tid) { return stride[tid]; } + + INLINE omptarget_nvptx_TeamDescr &TeamContext() { return teamContext; } + + INLINE void InitThreadPrivateContext(int tid); + INLINE uint64_t &Cnt() { return cnt; } + +private: + // team context for this team + omptarget_nvptx_TeamDescr teamContext; + // task ICV for implicit threads in the only parallel region + omptarget_nvptx_TaskDescr levelOneTaskDescr[MAX_THREADS_PER_TEAM]; + // pointer where to find the current task ICV (top of the stack) + omptarget_nvptx_TaskDescr *topTaskDescr[MAX_THREADS_PER_TEAM]; + union { + // Only one of the two is live at the same time. + // parallel + uint16_t tnum[MAX_THREADS_PER_TEAM]; + // simd limit + uint16_t slim[MAX_THREADS_PER_TEAM]; + } nextRegion; + // schedule (for dispatch) + kmp_sched_t schedule[MAX_THREADS_PER_TEAM]; // remember schedule type for #for + int64_t chunk[MAX_THREADS_PER_TEAM]; + int64_t loopUpperBound[MAX_THREADS_PER_TEAM]; + // state for dispatch with dyn/guided OR static (never use both at a time) + int64_t nextLowerBound[MAX_THREADS_PER_TEAM]; + int64_t stride[MAX_THREADS_PER_TEAM]; + uint64_t cnt; +}; + +/// Memory manager for statically allocated memory. 
+class omptarget_nvptx_SimpleMemoryManager { +private: + ALIGN(128) struct MemDataTy { + volatile unsigned keys[OMP_STATE_COUNT]; + } MemData[MAX_SM]; + + INLINE static uint32_t hash(unsigned key) { + return key & (OMP_STATE_COUNT - 1); + } + +public: + INLINE void Release(); + INLINE const void *Acquire(const void *buf, size_t size); +}; + +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern DEVICE omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +extern DEVICE SHARED uint32_t usedMemIdx; +extern DEVICE SHARED uint32_t usedSlotIdx; +extern DEVICE SHARED uint8_t + parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +extern DEVICE SHARED uint16_t threadLimit; +extern DEVICE SHARED uint16_t threadsInTeam; +extern DEVICE SHARED uint16_t nThreads; +extern DEVICE SHARED + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +extern DEVICE SHARED uint32_t execution_param; +extern DEVICE SHARED void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// work function (outlined parallel/simd functions) and arguments. +// needed for L1 parallelism only. +//////////////////////////////////////////////////////////////////////////////// + +typedef void *omptarget_nvptx_WorkFn; +extern volatile DEVICE SHARED omptarget_nvptx_WorkFn + omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// get private data structures +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor(); +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor(); +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode); +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int globalThreadId); + +//////////////////////////////////////////////////////////////////////////////// +// inlined implementation +//////////////////////////////////////////////////////////////////////////////// + +#include "common/omptargeti.h" + +#endif diff --git a/openmp/libomptarget/deviceRTLs/common/omptargeti.h b/openmp/libomptarget/deviceRTLs/common/omptargeti.h index 14faa59062aee..e20016eeaa0da 100644 --- a/openmp/libomptarget/deviceRTLs/common/omptargeti.h +++ b/openmp/libomptarget/deviceRTLs/common/omptargeti.h @@ -1,228 +1,228 @@ -//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the declarations of all library macros, types, -// and functions. 
-// -//===----------------------------------------------------------------------===// - -#include "common/target_atomic.h" - -//////////////////////////////////////////////////////////////////////////////// -// Task Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { - // sched starts from 1..4; encode it as 0..3; so add 1 here - uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; - return (omp_sched_t)rc; -} - -INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { - // sched starts from 1..4; encode it as 0..3; so sub 1 here - uint8_t val = ((uint8_t)sched) - 1; - // clear current sched - items.flags &= ~TaskDescr_SchedMask; - // set new sched - items.flags |= val; -} - -INLINE void -omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // not in parallel - - items.flags = 0; - items.threadId = 0; // is master - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 -} - -// This is called when all threads are started together in SPMD mode. -// OMP directives include target parallel, target distribute parallel for, etc. -INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - // slow method - // flag: - // default sched is static, - // dyn is off (unused now anyway, but may need to sample from host ?) - // in L1 parallel - - items.flags = - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel - items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) - items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyData( - omptarget_nvptx_TaskDescr *sourceTaskDescr) { - items = sourceTaskDescr->items; -} - -INLINE void -omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { - CopyData(sourceTaskDescr); - prev = sourceTaskDescr->prev; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyParent( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyData(parentTaskDescr); - prev = parentTaskDescr; -} - -INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( - omptarget_nvptx_TaskDescr *parentTaskDescr) { - CopyParent(parentTaskDescr); - items.flags = items.flags & ~TaskDescr_IsParConstr; - ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); -} - -INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( - omptarget_nvptx_TaskDescr *masterTaskDescr) { - CopyParent(masterTaskDescr); - // overwrite specific items; - items.flags |= - TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel -} - -INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( - omptarget_nvptx_TaskDescr *workTaskDescr) { - Copy(workTaskDescr); - // - // overwrite specific items; - // - // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). - // This is so that the serial master (first lane in the master warp) - // gets a threadId of 0. - // However, we know that this function is always called in a parallel - // region where only workers are active. The serial master thread - // never enters this region. When a parallel region is executed serially, - // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions - // are called, which never activate this region. 
- items.threadId = - GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) -} - -INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( - omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { - CopyParent(parentTaskDescr); - items.flags |= TaskDescr_InParL2P; // In L2+ parallelism - items.threadId = tid; -} - -INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { - loopData.loopUpperBound = - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); - loopData.nextLowerBound = - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); - loopData.schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); - loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); - loopData.stride = - omptarget_nvptx_threadPrivateContext->Stride(items.threadId); -} - -INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { - omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = - loopData.loopUpperBound; - omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = - loopData.nextLowerBound; - omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = - loopData.stride; - omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = - loopData.schedule; -} - -//////////////////////////////////////////////////////////////////////////////// -// Thread Private Context -//////////////////////////////////////////////////////////////////////////////// - -INLINE omptarget_nvptx_TaskDescr * -omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const { - ASSERT0( - LT_FUSSY, tid < MAX_THREADS_PER_TEAM, - "Getting top level, tid is larger than allocated data structure size"); - return topTaskDescr[tid]; -} - -INLINE void -omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { - // levelOneTaskDescr is init when starting the parallel region - // top task descr is NULL (team master version will be fixed separately) - topTaskDescr[tid] = NULL; - // no num threads value has been pushed - nextRegion.tnum[tid] = 0; - // the following don't need to be init here; they are init when using dyn - // sched - // current_Event, events_Number, chunk, num_Iterations, schedule -} - -//////////////////////////////////////////////////////////////////////////////// -// Team Descriptor -//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { - levelZeroTaskDescr.InitLevelZeroTaskDescr(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Get private data structure for thread -//////////////////////////////////////////////////////////////////////////////// - -// Utility routines for CUDA threads -INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { - return omptarget_nvptx_threadPrivateContext->TeamContext(); -} - -INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - return currTeamDescr.WorkDescr(); -} - -INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { - return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); -} - -INLINE omptarget_nvptx_TaskDescr * -getMyTopTaskDescriptor(bool isSPMDExecutionMode) { - return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); -} - 
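SaveLoopData and RestoreLoopData above exist because the dispatch state (chunk, bounds, stride, schedule) lives in per-thread-id slots of the shared thread-private context rather than in the task descriptor itself, so a descriptor snapshots those slots before an inner construct reuses them and writes them back afterwards. A minimal host-side C++ sketch of that round trip follows; kMaxThreads stands in for MAX_THREADS_PER_TEAM, the schedule field is omitted, and the type names are invented for the example.

// loop_state_sketch.cpp -- illustrative only.
#include <cassert>
#include <cstdint>

const int kMaxThreads = 1024; // stand-in for MAX_THREADS_PER_TEAM

// Struct-of-arrays dispatch state: one slot per thread id, as in
// omptarget_nvptx_ThreadPrivateContext, so neighbouring threads touch
// neighbouring memory.
struct DispatchState {
  int64_t chunk[kMaxThreads];
  int64_t loopUpperBound[kMaxThreads];
  int64_t nextLowerBound[kMaxThreads];
  int64_t stride[kMaxThreads];
};

// Per-descriptor snapshot, in the spirit of SaveLoopData/RestoreLoopData.
struct LoopSnapshot {
  int64_t chunk, loopUpperBound, nextLowerBound, stride;

  void save(const DispatchState &S, int tid) {
    chunk = S.chunk[tid];
    loopUpperBound = S.loopUpperBound[tid];
    nextLowerBound = S.nextLowerBound[tid];
    stride = S.stride[tid];
  }
  void restore(DispatchState &S, int tid) const {
    S.chunk[tid] = chunk;
    S.loopUpperBound[tid] = loopUpperBound;
    S.nextLowerBound[tid] = nextLowerBound;
    S.stride[tid] = stride;
  }
};

int main() {
  DispatchState S = {};
  const int tid = 3;
  S.chunk[tid] = 16;
  S.loopUpperBound[tid] = 999;
  S.nextLowerBound[tid] = 0;
  S.stride[tid] = 16;

  LoopSnapshot snap;
  snap.save(S, tid);           // outer loop state saved into the descriptor
  S.nextLowerBound[tid] = 512; // an inner construct clobbers the shared slot
  snap.restore(S, tid);        // outer loop resumes where it left off
  assert(S.nextLowerBound[tid] == 0);
  return 0;
}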
-//////////////////////////////////////////////////////////////////////////////// -// Memory management runtime functions. -//////////////////////////////////////////////////////////////////////////////// - -INLINE void omptarget_nvptx_SimpleMemoryManager::Release() { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT, - "MemIdx is too big or uninitialized."); - MemDataTy &MD = MemData[usedSlotIdx]; - __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u); -} - -INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf, - size_t size) { - ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM, - "SlotIdx is too big or uninitialized."); - const unsigned sm = usedSlotIdx; - MemDataTy &MD = MemData[sm]; - unsigned i = hash(GetBlockIdInKernel()); - while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) { - i = hash(i + 1); - } - usedSlotIdx = sm; - usedMemIdx = i; - return static_cast(buf) + (sm * OMP_STATE_COUNT + i) * size; -} +//===---- omptargeti.h - OpenMP GPU initialization --------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the declarations of all library macros, types, +// and functions. +// +//===----------------------------------------------------------------------===// + +#include "common/target_atomic.h" + +//////////////////////////////////////////////////////////////////////////////// +// Task Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE omp_sched_t omptarget_nvptx_TaskDescr::GetRuntimeSched() const { + // sched starts from 1..4; encode it as 0..3; so add 1 here + uint8_t rc = (items.flags & TaskDescr_SchedMask) + 1; + return (omp_sched_t)rc; +} + +INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) { + // sched starts from 1..4; encode it as 0..3; so sub 1 here + uint8_t val = ((uint8_t)sched) - 1; + // clear current sched + items.flags &= ~TaskDescr_SchedMask; + // set new sched + items.flags |= val; +} + +INLINE void +omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) + // not in parallel + + items.flags = 0; + items.threadId = 0; // is master + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 +} + +// This is called when all threads are started together in SPMD mode. +// OMP directives include target parallel, target distribute parallel for, etc. +INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + // slow method + // flag: + // default sched is static, + // dyn is off (unused now anyway, but may need to sample from host ?) 
+ // in L1 parallel + + items.flags = + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel + items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) + items.runtimeChunkSize = 1; // preferred chunking statik with chunk 1 + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyData( + omptarget_nvptx_TaskDescr *sourceTaskDescr) { + items = sourceTaskDescr->items; +} + +INLINE void +omptarget_nvptx_TaskDescr::Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr) { + CopyData(sourceTaskDescr); + prev = sourceTaskDescr->prev; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyParent( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyData(parentTaskDescr); + prev = parentTaskDescr; +} + +INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask( + omptarget_nvptx_TaskDescr *parentTaskDescr) { + CopyParent(parentTaskDescr); + items.flags = items.flags & ~TaskDescr_IsParConstr; + ASSERT0(LT_FUSSY, IsTaskConstruct(), "expected task"); +} + +INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr( + omptarget_nvptx_TaskDescr *masterTaskDescr) { + CopyParent(masterTaskDescr); + // overwrite specific items; + items.flags |= + TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel +} + +INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr( + omptarget_nvptx_TaskDescr *workTaskDescr) { + Copy(workTaskDescr); + // + // overwrite specific items; + // + // The threadID should be GetThreadIdInBlock() % GetMasterThreadID(). + // This is so that the serial master (first lane in the master warp) + // gets a threadId of 0. + // However, we know that this function is always called in a parallel + // region where only workers are active. The serial master thread + // never enters this region. When a parallel region is executed serially, + // the threadId is set to 0 elsewhere and the kmpc_serialized_* functions + // are called, which never activate this region. 
+ items.threadId = + GetThreadIdInBlock(); // get ids from cuda (only called for 1st level) +} + +INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent( + omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) { + CopyParent(parentTaskDescr); + items.flags |= TaskDescr_InParL2P; // In L2+ parallelism + items.threadId = tid; +} + +INLINE void omptarget_nvptx_TaskDescr::SaveLoopData() { + loopData.loopUpperBound = + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId); + loopData.nextLowerBound = + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId); + loopData.schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId); + loopData.chunk = omptarget_nvptx_threadPrivateContext->Chunk(items.threadId); + loopData.stride = + omptarget_nvptx_threadPrivateContext->Stride(items.threadId); +} + +INLINE void omptarget_nvptx_TaskDescr::RestoreLoopData() const { + omptarget_nvptx_threadPrivateContext->Chunk(items.threadId) = loopData.chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(items.threadId) = + loopData.loopUpperBound; + omptarget_nvptx_threadPrivateContext->NextLowerBound(items.threadId) = + loopData.nextLowerBound; + omptarget_nvptx_threadPrivateContext->Stride(items.threadId) = + loopData.stride; + omptarget_nvptx_threadPrivateContext->ScheduleType(items.threadId) = + loopData.schedule; +} + +//////////////////////////////////////////////////////////////////////////////// +// Thread Private Context +//////////////////////////////////////////////////////////////////////////////// + +INLINE omptarget_nvptx_TaskDescr * +omptarget_nvptx_ThreadPrivateContext::GetTopLevelTaskDescr(int tid) const { + ASSERT0( + LT_FUSSY, tid < MAX_THREADS_PER_TEAM, + "Getting top level, tid is larger than allocated data structure size"); + return topTaskDescr[tid]; +} + +INLINE void +omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) { + // levelOneTaskDescr is init when starting the parallel region + // top task descr is NULL (team master version will be fixed separately) + topTaskDescr[tid] = NULL; + // no num threads value has been pushed + nextRegion.tnum[tid] = 0; + // the following don't need to be init here; they are init when using dyn + // sched + // current_Event, events_Number, chunk, num_Iterations, schedule +} + +//////////////////////////////////////////////////////////////////////////////// +// Team Descriptor +//////////////////////////////////////////////////////////////////////////////// + +INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() { + levelZeroTaskDescr.InitLevelZeroTaskDescr(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Get private data structure for thread +//////////////////////////////////////////////////////////////////////////////// + +// Utility routines for CUDA threads +INLINE omptarget_nvptx_TeamDescr &getMyTeamDescriptor() { + return omptarget_nvptx_threadPrivateContext->TeamContext(); +} + +INLINE omptarget_nvptx_WorkDescr &getMyWorkDescriptor() { + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + return currTeamDescr.WorkDescr(); +} + +INLINE omptarget_nvptx_TaskDescr *getMyTopTaskDescriptor(int threadId) { + return omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); +} + +INLINE omptarget_nvptx_TaskDescr * +getMyTopTaskDescriptor(bool isSPMDExecutionMode) { + return getMyTopTaskDescriptor(GetLogicalThreadIdInBlock(isSPMDExecutionMode)); +} + 
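GetRuntimeSched and SetRuntimeSched above pack the runtime schedule into the low three bits of the descriptor's flags byte, storing omp_sched_t's 1..4 values as 0..3 so they coexist with the InPar/IsParConstr/InParL2P bits. The sketch below shows that encoding in plain C++; the constants mirror the TaskDescr_* masks declared earlier, while encodeSched/decodeSched are hypothetical helpers rather than runtime entry points.

// sched_flags_sketch.cpp -- illustrative only.
#include <cassert>
#include <cstdint>
#include <cstdio>

// Stand-ins for the task-descriptor flag bits.
static const uint8_t kSchedMask   = 0x1 | 0x2 | 0x4; // low 3 bits: schedule
static const uint8_t kInPar       = 0x10;            // inside a parallel region
static const uint8_t kIsParConstr = 0x20;            // ICV for a parallel construct

// omp_sched_t values are 1..4; the descriptor stores them as 0..3.
static uint8_t encodeSched(int sched, uint8_t flags) {
  uint8_t val = static_cast<uint8_t>(sched - 1);   // 1..4 -> 0..3
  flags = static_cast<uint8_t>(flags & ~kSchedMask); // clear old schedule bits
  return static_cast<uint8_t>(flags | val);          // set the new ones
}

static int decodeSched(uint8_t flags) {
  return (flags & kSchedMask) + 1;                   // 0..3 -> 1..4
}

int main() {
  uint8_t flags = kInPar | kIsParConstr; // parallel construct, schedule bits 0
  flags = encodeSched(3, flags);         // omp_sched_guided is 3
  assert(decodeSched(flags) == 3);
  assert(flags & kInPar);                // unrelated bits stay untouched
  std::printf("flags = 0x%02x, sched = %d\n", flags, decodeSched(flags));
  return 0;
}

Clearing the mask before or-ing in the new value is what lets the same byte carry both the schedule and the region-nesting bits.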
+////////////////////////////////////////////////////////////////////////////////
+// Memory management runtime functions.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE void omptarget_nvptx_SimpleMemoryManager::Release() {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  ASSERT0(LT_FUSSY, usedMemIdx < OMP_STATE_COUNT,
+          "MemIdx is too big or uninitialized.");
+  MemDataTy &MD = MemData[usedSlotIdx];
+  __kmpc_atomic_exchange((unsigned *)&MD.keys[usedMemIdx], 0u);
+}
+
+INLINE const void *omptarget_nvptx_SimpleMemoryManager::Acquire(const void *buf,
+                                                                size_t size) {
+  ASSERT0(LT_FUSSY, usedSlotIdx < MAX_SM,
+          "SlotIdx is too big or uninitialized.");
+  const unsigned sm = usedSlotIdx;
+  MemDataTy &MD = MemData[sm];
+  unsigned i = hash(GetBlockIdInKernel());
+  while (__kmpc_atomic_cas((unsigned *)&MD.keys[i], 0u, 1u) != 0) {
+    i = hash(i + 1);
+  }
+  usedSlotIdx = sm;
+  usedMemIdx = i;
+  return static_cast<const char *>(buf) + (sm * OMP_STATE_COUNT + i) * size;
+}
diff --git a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
index 9540f5647699b..4a1a13cce2c28 100644
--- a/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
+++ b/openmp/libomptarget/deviceRTLs/common/src/cancel.cu
@@ -1,28 +1,28 @@
-//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Interface to be used in the implementation of OpenMP cancel.
-//
-//===----------------------------------------------------------------------===//
-
-#include "interface.h"
-#include "common/debug.h"
-
-EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid,
-                                        int32_t cancelVal) {
-  PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal);
-  // disabled
-  return 0;
-}
-
-EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid,
-                             int32_t cancelVal) {
-  PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal);
-  // disabled
-  return 0;
-}
+//===------ cancel.cu - NVPTX OpenMP cancel interface ------------ CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Interface to be used in the implementation of OpenMP cancel.
+// +//===----------------------------------------------------------------------===// + +#include "interface.h" +#include "common/debug.h" + +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancellationpoint(cancel val %d)\n", (int)cancelVal); + // disabled + return 0; +} + +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal) { + PRINT(LD_IO, "call kmpc_cancel(cancel val %d)\n", (int)cancelVal); + // disabled + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/critical.cu b/openmp/libomptarget/deviceRTLs/common/src/critical.cu index ee4b056ddad92..08fc053c33ce0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/critical.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/critical.cu @@ -1,28 +1,28 @@ -//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of critical with KMPC interface -// -//===----------------------------------------------------------------------===// - -#include "interface.h" -#include "common/debug.h" - -EXTERN -void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_critical()\n"); - omp_set_lock((omp_lock_t *)lck); -} - -EXTERN -void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *lck) { - PRINT0(LD_IO, "call to kmpc_end_critical()\n"); - omp_unset_lock((omp_lock_t *)lck); -} +//===------ critical.cu - NVPTX OpenMP critical ------------------ CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of critical with KMPC interface +// +//===----------------------------------------------------------------------===// + +#include "interface.h" +#include "common/debug.h" + +EXTERN +void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_critical()\n"); + omp_set_lock((omp_lock_t *)lck); +} + +EXTERN +void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *lck) { + PRINT0(LD_IO, "call to kmpc_end_critical()\n"); + omp_unset_lock((omp_lock_t *)lck); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu index f6523c8ce8aa2..0e10a6a2364d0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/data_sharing.cu @@ -1,568 +1,568 @@ -//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of data sharing environments -// -//===----------------------------------------------------------------------===// -#include "common/omptarget.h" -#include "target_impl.h" - -// Return true if this is the master thread. -INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { - return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); -} - -/// Return the provided size aligned to the size of a pointer. -INLINE static size_t AlignVal(size_t Val) { - const size_t Align = (size_t)sizeof(void *); - if (Val & (Align - 1)) { - Val += Align; - Val &= ~(Align - 1); - } - return Val; -} - -#define DSFLAG 0 -#define DSFLAG_INIT 0 -#define DSPRINT(_flag, _str, _args...) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ - } \ - } -#define DSPRINT0(_flag, _str) \ - { \ - if (_flag) { \ - /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ - } \ - } - -// Initialize the shared data structures. This is expected to be called for the -// master thread and warp masters. \param RootS: A pointer to the root of the -// data sharing stack. \param InitialDataSize: The initial size of the data in -// the slot. -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, - size_t InitialDataSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - DSPRINT0(DSFLAG_INIT, - "Entering __kmpc_initialize_data_sharing_environment\n"); - - unsigned WID = GetWarpId(); - DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); - - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - __kmpc_data_sharing_slot *RootS = - teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); - - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - - // We don't need to initialize the frame and active threads. - - DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); - DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); - DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", - (unsigned long long)RootS->DataEnd); - DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", - (unsigned long long)RootS->Next); - DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", - (unsigned long long)DataSharingState.SlotPtr[WID]); - DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", - (unsigned long long)DataSharingState.StackPtr[WID]); - - DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); -} - -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); - - // If the runtime has been elided, used shared memory for master-worker - // data sharing. 
- if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - - unsigned WID = GetWarpId(); - __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); - - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); - // Save the current values. - *SavedSharedSlot = SlotP; - *SavedSharedStack = StackP; - *SavedSharedFrame = FrameP; - *SavedActiveThreads = ActiveT; - - DSPRINT(DSFLAG, "Warp ID: %u\n", WID); - DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); - DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - - // Only the warp active master needs to grow the stack. - if (__kmpc_impl_is_first_active_thread()) { - // Save the current active threads. - ActiveT = CurActiveThreads; - - // Make sure we use aligned sizes to avoid rematerialization of data. - SharingDataSize = AlignVal(SharingDataSize); - // FIXME: The default data size can be assumed to be aligned? - SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); - - // Check if we have room for the data in the current slot. - const uintptr_t CurrentStartAddress = (uintptr_t)StackP; - const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequiredEndAddress = - CurrentStartAddress + (uintptr_t)SharingDataSize; - - DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); - DSPRINT(DSFLAG, "Default Data Size %016llx\n", - (unsigned long long)SharingDefaultDataSize); - DSPRINT(DSFLAG, "Current Start Address %016llx\n", - (unsigned long long)CurrentStartAddress); - DSPRINT(DSFLAG, "Current End Address %016llx\n", - (unsigned long long)CurrentEndAddress); - DSPRINT(DSFLAG, "Required End Address %016llx\n", - (unsigned long long)RequiredEndAddress); - DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); - - // If we require a new slot, allocate it and initialize it (or attempt to - // reuse one). Also, set the shared stack and slot pointers to the new - // place. If we do not need to grow the stack, just adapt the stack and - // frame pointers. - if (CurrentEndAddress < RequiredEndAddress) { - size_t NewSize = (SharingDataSize > SharingDefaultDataSize) - ? SharingDataSize - : SharingDefaultDataSize; - __kmpc_data_sharing_slot *NewSlot = 0; - - // Attempt to reuse an existing slot. 
- if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { - uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - - (uintptr_t)(&ExistingSlot->Data[0]); - if (ExistingSlotSize >= NewSize) { - DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", - (unsigned long long)ExistingSlot); - NewSlot = ExistingSlot; - } else { - DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(ExistingSlot, "Failed reuse"); - } - } - - if (!NewSlot) { - NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Warp master slot allocation"); - DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", - (unsigned long long)NewSlot, NewSize); - } - - NewSlot->Next = 0; - NewSlot->DataEnd = &NewSlot->Data[NewSize]; - - SlotP->Next = NewSlot; - SlotP = NewSlot; - StackP = &NewSlot->Data[SharingDataSize]; - FrameP = &NewSlot->Data[0]; - } else { - - // Clean up any old slot that we may still have. The slot producers, do - // not eliminate them because that may be used to return data. - if (SlotP->Next) { - DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", - (unsigned long long)SlotP->Next); - SafeFree(SlotP->Next, "Old slot not required"); - SlotP->Next = 0; - } - - FrameP = StackP; - StackP = (void *)RequiredEndAddress; - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); - - // All the threads in this warp get the frame they should work with. - return FrameP; -} - -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint) { - - DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); - - unsigned WID = GetWarpId(); - - if (IsEntryPoint) { - if (__kmpc_impl_is_first_active_thread()) { - DSPRINT0(DSFLAG, "Doing clean up\n"); - - // The master thread cleans the saved slot, because this is an environment - // only for the master. - __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) - ? *SavedSharedSlot - : DataSharingState.SlotPtr[WID]; - - if (S->Next) { - SafeFree(S->Next, "Sharing environment end"); - S->Next = 0; - } - } - - DSPRINT0(DSFLAG, "Exiting Exiting __kmpc_data_sharing_environment_end\n"); - return; - } - - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - // Only the warp master can restore the stack and frame information, and only - // if there are no other threads left behind in this environment (i.e. the - // warp diverged and returns in different places). This only works if we - // assume that threads will converge right after the call site that started - // the environment. - if (__kmpc_impl_is_first_active_thread()) { - __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; - - DSPRINT0(DSFLAG, "Before restoring the stack\n"); - // Zero the bits in the mask. If it is still different from zero, then we - // have other threads that will return after the current ones. - ActiveT &= ~CurActive; - - DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n", - (unsigned)CurActive, (unsigned)ActiveT); - - if (!ActiveT) { - // No other active threads? Great, lets restore the stack. 
- - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - void * volatile &FrameP = DataSharingState.FramePtr[WID]; - - SlotP = *SavedSharedSlot; - StackP = *SavedSharedStack; - FrameP = *SavedSharedFrame; - ActiveT = *SavedActiveThreads; - - DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n", - (unsigned long long)SlotP); - DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n", - (unsigned long long)StackP); - DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n", - (unsigned long long)FrameP); - DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); - } - } - - // FIXME: Need to see the impact of doing it here. - __kmpc_impl_threadfence_block(); - - DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n"); - return; -} - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized) { - DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n"); - - // If the runtime has been elided, use shared memory for master-worker - // data sharing. We're reusing the statically allocated data structure - // that is used for standard data sharing. - if (!IsOMPRuntimeInitialized) - return (void *)&DataSharingState; - - // Get the frame used by the requested thread. - - unsigned SourceWID = SourceThreadID / WARPSIZE; - - DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID); - - void * volatile P = DataSharingState.FramePtr[SourceWID]; - DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n"); - return P; -} - -//////////////////////////////////////////////////////////////////////////////// -// Runtime functions for trunk data sharing scheme. -//////////////////////////////////////////////////////////////////////////////// - -INLINE static void data_sharing_init_stack_common() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - omptarget_nvptx_TeamDescr *teamDescr = - &omptarget_nvptx_threadPrivateContext->TeamContext(); - - for (int WID = 0; WID < WARPSIZE; WID++) { - __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called only by the MASTER thread of each -// team in non-SPMD mode. -EXTERN void __kmpc_data_sharing_init_stack() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. - data_sharing_init_stack_common(); - omptarget_nvptx_globalArgs.Init(); -} - -// Initialize data sharing data structure. This function needs to be called -// once at the beginning of a data sharing context (coincides with the kernel -// initialization). This function is called in SPMD mode only. -EXTERN void __kmpc_data_sharing_init_stack_spmd() { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); - // This function initializes the stack pointer with the pointer to the - // statically allocated shared memory slots. The size of a shared memory - // slot is pre-determined to be 256 bytes. 
- if (GetThreadIdInBlock() == 0) - data_sharing_init_stack_common(); - - __kmpc_impl_threadfence_block(); -} - -INLINE static void* data_sharing_push_stack_common(size_t PushSize) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only warp active master threads manage the stack. - bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; - - // Add worst-case padding to DataSize so that future stack allocations are - // correctly aligned. - const size_t Alignment = 8; - PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; - - // Frame pointer must be visible to all workers in the same warp. - const unsigned WID = GetWarpId(); - void *FrameP = 0; - __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); - - if (IsWarpMaster) { - // SlotP will point to either the shared memory slot or an existing - // global memory slot. - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - void *&StackP = DataSharingState.StackPtr[WID]; - - // Check if we have room for the data in the current slot. - const uintptr_t StartAddress = (uintptr_t)StackP; - const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; - const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; - - // If we requested more data than there is room for in the rest - // of the slot then we need to either re-use the next slot, if one exists, - // or create a new slot. - if (EndAddress < RequestedEndAddress) { - __kmpc_data_sharing_slot *NewSlot = 0; - size_t NewSize = PushSize; - - // Allocate at least the default size for each type of slot. - // Master is a special case and even though there is only one thread, - // it can share more things with the workers. For uniformity, it uses - // the full size of a worker warp slot. - size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; - if (DefaultSlotSize > NewSize) - NewSize = DefaultSlotSize; - NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( - sizeof(__kmpc_data_sharing_slot) + NewSize, - "Global memory slot allocation."); - - NewSlot->Next = 0; - NewSlot->Prev = SlotP; - NewSlot->PrevSlotStackPtr = StackP; - NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; - - // Make previous slot point to the newly allocated slot. - SlotP->Next = NewSlot; - // The current slot becomes the new slot. - SlotP = NewSlot; - // The stack pointer always points to the next free stack frame. - StackP = &NewSlot->Data[0] + PushSize; - // The frame pointer always points to the beginning of the frame. - FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0]; - } else { - // Add the data chunk to the current slot. The frame pointer is set to - // point to the start of the new frame held in StackP. - FrameP = DataSharingState.FramePtr[WID] = StackP; - // Reset stack pointer to the requested address. - StackP = (void *)RequestedEndAddress; - } - } - // Get address from lane 0. - int *FP = (int *)&FrameP; - FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0); - if (sizeof(FrameP) == 8) - FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0); - - return FrameP; -} - -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - return data_sharing_push_stack_common(DataSize); -} - -// Called at the time of the kernel initialization. This is used to initilize -// the list of references to shared variables and to pre-allocate global storage -// for holding the globalized variables. -// -// By default the globalized variables are stored in global memory. 
If the -// UseSharedMemory is set to true, the runtime will attempt to use shared memory -// as long as the size requested fits the pre-allocated size. -EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize, - int16_t UseSharedMemory) { - // Compute the total memory footprint of the requested data. - // The master thread requires a stack only for itself. A worker - // thread (which at this point is a warp master) will require - // space for the variables of each thread in the warp, - // i.e. one DataSize chunk per warp lane. - // TODO: change WARPSIZE to the number of active threads in the warp. - size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode())) - ? DataSize - : WARPSIZE * DataSize; - - // Compute the start address of the frame of each thread in the warp. - uintptr_t FrameStartAddress = - (uintptr_t) data_sharing_push_stack_common(PushSize); - FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize); - return (void *)FrameStartAddress; -} - -// Pop the stack and free any memory which can be reclaimed. -// -// When the pop operation removes the last global memory slot, -// reclaim all outstanding global memory slots since it is -// likely we have reached the end of the kernel. -EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) { - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - __kmpc_impl_threadfence_block(); - - if (GetThreadIdInBlock() % WARPSIZE == 0) { - unsigned WID = GetWarpId(); - - // Current slot - __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; - - // Pointer to next available stack. - void *&StackP = DataSharingState.StackPtr[WID]; - - // Pop the frame. - StackP = FrameStart; - - // If the current slot is empty, we need to free the slot after the - // pop. - bool SlotEmpty = (StackP == &SlotP->Data[0]); - - if (SlotEmpty && SlotP->Prev) { - // Before removing the slot we need to reset StackP. - StackP = SlotP->PrevSlotStackPtr; - - // Remove the slot. - SlotP = SlotP->Prev; - SafeFree(SlotP->Next, "Free slot."); - SlotP->Next = 0; - } - } -} - -// Begin a data sharing context. Maintain a list of references to shared -// variables. This list of references to shared variables will be passed -// to one or more threads. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { - omptarget_nvptx_globalArgs.EnsureSize(nArgs); - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// End a data sharing context. There is no need to have a list of refs -// to shared variables because the context in which those variables were -// shared has now ended. This should clean-up the list of references only -// without affecting the actual global storage of the variables. -// In L0 data sharing this is called by master thread. -// In L1 data sharing this is called by active warp master thread. -EXTERN void __kmpc_end_sharing_variables() { - omptarget_nvptx_globalArgs.DeInit(); -} - -// This function will return a list of references to global variables. This -// is how the workers will get a reference to the globalized variable. The -// members of this list will be passed to the outlined parallel function -// preserving the order. -// Called by all workers. -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { - *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); -} - -// This function is used to init static memory manager. 
This manager is used to -// manage statically allocated global memory. This memory is allocated by the -// compiler and used to correctly implement globalization of the variables in -// target, teams and distribute regions. -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, - const void **frame) { - if (is_shared) { - *frame = buf; - return; - } - if (isSPMDExecutionMode) { - if (GetThreadIdInBlock() == 0) { - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - } - __kmpc_impl_syncthreads(); - return; - } - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); - __kmpc_impl_threadfence(); -} - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared) { - if (is_shared) - return; - if (isSPMDExecutionMode) { - __kmpc_impl_syncthreads(); - if (GetThreadIdInBlock() == 0) { - omptarget_nvptx_simpleMemoryManager.Release(); - } - return; - } - __kmpc_impl_threadfence(); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "Must be called only in the target master thread."); - omptarget_nvptx_simpleMemoryManager.Release(); -} - +//===----- data_sharing.cu - OpenMP GPU data sharing ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of data sharing environments +// +//===----------------------------------------------------------------------===// +#include "common/omptarget.h" +#include "target_impl.h" + +// Return true if this is the master thread. +INLINE static bool IsMasterThread(bool isSPMDExecutionMode) { + return !isSPMDExecutionMode && GetMasterThreadID() == GetThreadIdInBlock(); +} + +/// Return the provided size aligned to the size of a pointer. +INLINE static size_t AlignVal(size_t Val) { + const size_t Align = (size_t)sizeof(void *); + if (Val & (Align - 1)) { + Val += Align; + Val &= ~(Align - 1); + } + return Val; +} + +#define DSFLAG 0 +#define DSFLAG_INIT 0 +#define DSPRINT(_flag, _str, _args...) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x, _args);*/ \ + } \ + } +#define DSPRINT0(_flag, _str) \ + { \ + if (_flag) { \ + /*printf("(%d,%d) -> " _str, blockIdx.x, threadIdx.x);*/ \ + } \ + } + +// Initialize the shared data structures. This is expected to be called for the +// master thread and warp masters. \param RootS: A pointer to the root of the +// data sharing stack. \param InitialDataSize: The initial size of the data in +// the slot. 
+EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *rootS, + size_t InitialDataSize) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + DSPRINT0(DSFLAG_INIT, + "Entering __kmpc_initialize_data_sharing_environment\n"); + + unsigned WID = GetWarpId(); + DSPRINT(DSFLAG_INIT, "Warp ID: %u\n", WID); + + omptarget_nvptx_TeamDescr *teamDescr = + &omptarget_nvptx_threadPrivateContext->TeamContext(); + __kmpc_data_sharing_slot *RootS = + teamDescr->RootS(WID, IsMasterThread(isSPMDMode())); + + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + + // We don't need to initialize the frame and active threads. + + DSPRINT(DSFLAG_INIT, "Initial data size: %08x \n", (unsigned)InitialDataSize); + DSPRINT(DSFLAG_INIT, "Root slot at: %016llx \n", (unsigned long long)RootS); + DSPRINT(DSFLAG_INIT, "Root slot data-end at: %016llx \n", + (unsigned long long)RootS->DataEnd); + DSPRINT(DSFLAG_INIT, "Root slot next at: %016llx \n", + (unsigned long long)RootS->Next); + DSPRINT(DSFLAG_INIT, "Shared slot ptr at: %016llx \n", + (unsigned long long)DataSharingState.SlotPtr[WID]); + DSPRINT(DSFLAG_INIT, "Shared stack ptr at: %016llx \n", + (unsigned long long)DataSharingState.StackPtr[WID]); + + DSPRINT0(DSFLAG_INIT, "Exiting __kmpc_initialize_data_sharing_environment\n"); +} + +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_begin\n"); + + // If the runtime has been elided, used shared memory for master-worker + // data sharing. + if (!IsOMPRuntimeInitialized) + return (void *)&DataSharingState; + + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + + unsigned WID = GetWarpId(); + __kmpc_impl_lanemask_t CurActiveThreads = __kmpc_impl_activemask(); + + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + void * volatile &FrameP = DataSharingState.FramePtr[WID]; + __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID]; + + DSPRINT0(DSFLAG, "Save current slot/stack values.\n"); + // Save the current values. + *SavedSharedSlot = SlotP; + *SavedSharedStack = StackP; + *SavedSharedFrame = FrameP; + *SavedActiveThreads = ActiveT; + + DSPRINT(DSFLAG, "Warp ID: %u\n", WID); + DSPRINT(DSFLAG, "Saved slot ptr at: %016llx \n", (unsigned long long)SlotP); + DSPRINT(DSFLAG, "Saved stack ptr at: %016llx \n", (unsigned long long)StackP); + DSPRINT(DSFLAG, "Saved frame ptr at: %016llx \n", (long long)FrameP); + DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT); + + // Only the warp active master needs to grow the stack. + if (__kmpc_impl_is_first_active_thread()) { + // Save the current active threads. + ActiveT = CurActiveThreads; + + // Make sure we use aligned sizes to avoid rematerialization of data. + SharingDataSize = AlignVal(SharingDataSize); + // FIXME: The default data size can be assumed to be aligned? + SharingDefaultDataSize = AlignVal(SharingDefaultDataSize); + + // Check if we have room for the data in the current slot. 
+ const uintptr_t CurrentStartAddress = (uintptr_t)StackP; + const uintptr_t CurrentEndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequiredEndAddress = + CurrentStartAddress + (uintptr_t)SharingDataSize; + + DSPRINT(DSFLAG, "Data Size %016llx\n", (unsigned long long)SharingDataSize); + DSPRINT(DSFLAG, "Default Data Size %016llx\n", + (unsigned long long)SharingDefaultDataSize); + DSPRINT(DSFLAG, "Current Start Address %016llx\n", + (unsigned long long)CurrentStartAddress); + DSPRINT(DSFLAG, "Current End Address %016llx\n", + (unsigned long long)CurrentEndAddress); + DSPRINT(DSFLAG, "Required End Address %016llx\n", + (unsigned long long)RequiredEndAddress); + DSPRINT(DSFLAG, "Active Threads %08x\n", (unsigned)ActiveT); + + // If we require a new slot, allocate it and initialize it (or attempt to + // reuse one). Also, set the shared stack and slot pointers to the new + // place. If we do not need to grow the stack, just adapt the stack and + // frame pointers. + if (CurrentEndAddress < RequiredEndAddress) { + size_t NewSize = (SharingDataSize > SharingDefaultDataSize) + ? SharingDataSize + : SharingDefaultDataSize; + __kmpc_data_sharing_slot *NewSlot = 0; + + // Attempt to reuse an existing slot. + if (__kmpc_data_sharing_slot *ExistingSlot = SlotP->Next) { + uintptr_t ExistingSlotSize = (uintptr_t)ExistingSlot->DataEnd - + (uintptr_t)(&ExistingSlot->Data[0]); + if (ExistingSlotSize >= NewSize) { + DSPRINT(DSFLAG, "Reusing stack slot %016llx\n", + (unsigned long long)ExistingSlot); + NewSlot = ExistingSlot; + } else { + DSPRINT(DSFLAG, "Cleaning up -failed reuse - %016llx\n", + (unsigned long long)SlotP->Next); + SafeFree(ExistingSlot, "Failed reuse"); + } + } + + if (!NewSlot) { + NewSlot = (__kmpc_data_sharing_slot *)SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Warp master slot allocation"); + DSPRINT(DSFLAG, "New slot allocated %016llx (data size=%016llx)\n", + (unsigned long long)NewSlot, NewSize); + } + + NewSlot->Next = 0; + NewSlot->DataEnd = &NewSlot->Data[NewSize]; + + SlotP->Next = NewSlot; + SlotP = NewSlot; + StackP = &NewSlot->Data[SharingDataSize]; + FrameP = &NewSlot->Data[0]; + } else { + + // Clean up any old slot that we may still have. The slot producers, do + // not eliminate them because that may be used to return data. + if (SlotP->Next) { + DSPRINT(DSFLAG, "Cleaning up - old not required - %016llx\n", + (unsigned long long)SlotP->Next); + SafeFree(SlotP->Next, "Old slot not required"); + SlotP->Next = 0; + } + + FrameP = StackP; + StackP = (void *)RequiredEndAddress; + } + } + + // FIXME: Need to see the impact of doing it here. + __kmpc_impl_threadfence_block(); + + DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_begin\n"); + + // All the threads in this warp get the frame they should work with. + return FrameP; +} + +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + int32_t IsEntryPoint) { + + DSPRINT0(DSFLAG, "Entering __kmpc_data_sharing_environment_end\n"); + + unsigned WID = GetWarpId(); + + if (IsEntryPoint) { + if (__kmpc_impl_is_first_active_thread()) { + DSPRINT0(DSFLAG, "Doing clean up\n"); + + // The master thread cleans the saved slot, because this is an environment + // only for the master. + __kmpc_data_sharing_slot *S = IsMasterThread(isSPMDMode()) + ? 
*SavedSharedSlot
+                                      : DataSharingState.SlotPtr[WID];
+
+      if (S->Next) {
+        SafeFree(S->Next, "Sharing environment end");
+        S->Next = 0;
+      }
+    }
+
+    DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+    return;
+  }
+
+  __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask();
+
+  // Only the warp master can restore the stack and frame information, and only
+  // if there are no other threads left behind in this environment (i.e. the
+  // warp diverged and returns in different places). This only works if we
+  // assume that threads will converge right after the call site that started
+  // the environment.
+  if (__kmpc_impl_is_first_active_thread()) {
+    __kmpc_impl_lanemask_t &ActiveT = DataSharingState.ActiveThreads[WID];
+
+    DSPRINT0(DSFLAG, "Before restoring the stack\n");
+    // Zero the bits in the mask. If it is still different from zero, then we
+    // have other threads that will return after the current ones.
+    ActiveT &= ~CurActive;
+
+    DSPRINT(DSFLAG, "Active threads: %08x; New mask: %08x\n",
+            (unsigned)CurActive, (unsigned)ActiveT);
+
+    if (!ActiveT) {
+      // No other active threads? Great, let's restore the stack.
+
+      __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+      void *&StackP = DataSharingState.StackPtr[WID];
+      void * volatile &FrameP = DataSharingState.FramePtr[WID];
+
+      SlotP = *SavedSharedSlot;
+      StackP = *SavedSharedStack;
+      FrameP = *SavedSharedFrame;
+      ActiveT = *SavedActiveThreads;
+
+      DSPRINT(DSFLAG, "Restored slot ptr at: %016llx \n",
+              (unsigned long long)SlotP);
+      DSPRINT(DSFLAG, "Restored stack ptr at: %016llx \n",
+              (unsigned long long)StackP);
+      DSPRINT(DSFLAG, "Restored frame ptr at: %016llx \n",
+              (unsigned long long)FrameP);
+      DSPRINT(DSFLAG, "Active threads: %08x \n", (unsigned)ActiveT);
+    }
+  }
+
+  // FIXME: Need to see the impact of doing it here.
+  __kmpc_impl_threadfence_block();
+
+  DSPRINT0(DSFLAG, "Exiting __kmpc_data_sharing_environment_end\n");
+  return;
+}
+
+EXTERN void *
+__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID,
+                                          int16_t IsOMPRuntimeInitialized) {
+  DSPRINT0(DSFLAG, "Entering __kmpc_get_data_sharing_environment_frame\n");
+
+  // If the runtime has been elided, use shared memory for master-worker
+  // data sharing. We're reusing the statically allocated data structure
+  // that is used for standard data sharing.
+  if (!IsOMPRuntimeInitialized)
+    return (void *)&DataSharingState;
+
+  // Get the frame used by the requested thread.
+
+  unsigned SourceWID = SourceThreadID / WARPSIZE;
+
+  DSPRINT(DSFLAG, "Source warp: %u\n", SourceWID);
+
+  void * volatile P = DataSharingState.FramePtr[SourceWID];
+  DSPRINT0(DSFLAG, "Exiting __kmpc_get_data_sharing_environment_frame\n");
+  return P;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// Runtime functions for trunk data sharing scheme.
+////////////////////////////////////////////////////////////////////////////////
+
+INLINE static void data_sharing_init_stack_common() {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized.");
+  omptarget_nvptx_TeamDescr *teamDescr =
+      &omptarget_nvptx_threadPrivateContext->TeamContext();
+
+  for (int WID = 0; WID < WARPSIZE; WID++) {
+    __kmpc_data_sharing_slot *RootS = teamDescr->GetPreallocatedSlotAddr(WID);
+    DataSharingState.SlotPtr[WID] = RootS;
+    DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0];
+  }
+}
+
+// Initialize data sharing data structure. 
This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called only by the MASTER thread of each +// team in non-SPMD mode. +EXTERN void __kmpc_data_sharing_init_stack() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + data_sharing_init_stack_common(); + omptarget_nvptx_globalArgs.Init(); +} + +// Initialize data sharing data structure. This function needs to be called +// once at the beginning of a data sharing context (coincides with the kernel +// initialization). This function is called in SPMD mode only. +EXTERN void __kmpc_data_sharing_init_stack_spmd() { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Runtime must be initialized."); + // This function initializes the stack pointer with the pointer to the + // statically allocated shared memory slots. The size of a shared memory + // slot is pre-determined to be 256 bytes. + if (GetThreadIdInBlock() == 0) + data_sharing_init_stack_common(); + + __kmpc_impl_threadfence_block(); +} + +INLINE static void* data_sharing_push_stack_common(size_t PushSize) { + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + // Only warp active master threads manage the stack. + bool IsWarpMaster = (GetThreadIdInBlock() % WARPSIZE) == 0; + + // Add worst-case padding to DataSize so that future stack allocations are + // correctly aligned. + const size_t Alignment = 8; + PushSize = (PushSize + (Alignment - 1)) / Alignment * Alignment; + + // Frame pointer must be visible to all workers in the same warp. + const unsigned WID = GetWarpId(); + void *FrameP = 0; + __kmpc_impl_lanemask_t CurActive = __kmpc_impl_activemask(); + + if (IsWarpMaster) { + // SlotP will point to either the shared memory slot or an existing + // global memory slot. + __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID]; + void *&StackP = DataSharingState.StackPtr[WID]; + + // Check if we have room for the data in the current slot. + const uintptr_t StartAddress = (uintptr_t)StackP; + const uintptr_t EndAddress = (uintptr_t)SlotP->DataEnd; + const uintptr_t RequestedEndAddress = StartAddress + (uintptr_t)PushSize; + + // If we requested more data than there is room for in the rest + // of the slot then we need to either re-use the next slot, if one exists, + // or create a new slot. + if (EndAddress < RequestedEndAddress) { + __kmpc_data_sharing_slot *NewSlot = 0; + size_t NewSize = PushSize; + + // Allocate at least the default size for each type of slot. + // Master is a special case and even though there is only one thread, + // it can share more things with the workers. For uniformity, it uses + // the full size of a worker warp slot. + size_t DefaultSlotSize = DS_Worker_Warp_Slot_Size; + if (DefaultSlotSize > NewSize) + NewSize = DefaultSlotSize; + NewSlot = (__kmpc_data_sharing_slot *) SafeMalloc( + sizeof(__kmpc_data_sharing_slot) + NewSize, + "Global memory slot allocation."); + + NewSlot->Next = 0; + NewSlot->Prev = SlotP; + NewSlot->PrevSlotStackPtr = StackP; + NewSlot->DataEnd = &NewSlot->Data[0] + NewSize; + + // Make previous slot point to the newly allocated slot. + SlotP->Next = NewSlot; + // The current slot becomes the new slot. + SlotP = NewSlot; + // The stack pointer always points to the next free stack frame. 
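+      // Illustrative layout (hypothetical sizes): for a PushSize of 64 bytes
+      // in a freshly allocated 512-byte slot, the frame occupies Data[0..63],
+      // FrameP points at Data[0] and StackP is left at Data[64], the next
+      // free byte.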
+      StackP = &NewSlot->Data[0] + PushSize;
+      // The frame pointer always points to the beginning of the frame.
+      FrameP = DataSharingState.FramePtr[WID] = &NewSlot->Data[0];
+    } else {
+      // Add the data chunk to the current slot. The frame pointer is set to
+      // point to the start of the new frame held in StackP.
+      FrameP = DataSharingState.FramePtr[WID] = StackP;
+      // Reset stack pointer to the requested address.
+      StackP = (void *)RequestedEndAddress;
+    }
+  }
+  // Get address from lane 0.
+  int *FP = (int *)&FrameP;
+  FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
+  if (sizeof(FrameP) == 8)
+    FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
+
+  return FrameP;
+}
+
+EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t DataSize,
+                                                      int16_t UseSharedMemory) {
+  return data_sharing_push_stack_common(DataSize);
+}
+
+// Called at the time of the kernel initialization. This is used to initialize
+// the list of references to shared variables and to pre-allocate global storage
+// for holding the globalized variables.
+//
+// By default the globalized variables are stored in global memory. If
+// UseSharedMemory is set to true, the runtime will attempt to use shared memory
+// as long as the size requested fits the pre-allocated size.
+EXTERN void *__kmpc_data_sharing_push_stack(size_t DataSize,
+                                            int16_t UseSharedMemory) {
+  // Compute the total memory footprint of the requested data.
+  // The master thread requires a stack only for itself. A worker
+  // thread (which at this point is a warp master) will require
+  // space for the variables of each thread in the warp,
+  // i.e. one DataSize chunk per warp lane.
+  // TODO: change WARPSIZE to the number of active threads in the warp.
+  size_t PushSize = (isRuntimeUninitialized() || IsMasterThread(isSPMDMode()))
+                        ? DataSize
+                        : WARPSIZE * DataSize;
+
+  // Compute the start address of the frame of each thread in the warp.
+  uintptr_t FrameStartAddress =
+      (uintptr_t) data_sharing_push_stack_common(PushSize);
+  FrameStartAddress += (uintptr_t) (GetLaneId() * DataSize);
+  return (void *)FrameStartAddress;
+}
+
+// Pop the stack and free any memory which can be reclaimed.
+//
+// When the pop operation removes the last global memory slot,
+// reclaim all outstanding global memory slots since it is
+// likely we have reached the end of the kernel.
+EXTERN void __kmpc_data_sharing_pop_stack(void *FrameStart) {
+  ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime.");
+
+  __kmpc_impl_threadfence_block();
+
+  if (GetThreadIdInBlock() % WARPSIZE == 0) {
+    unsigned WID = GetWarpId();
+
+    // Current slot
+    __kmpc_data_sharing_slot *&SlotP = DataSharingState.SlotPtr[WID];
+
+    // Pointer to next available stack.
+    void *&StackP = DataSharingState.StackPtr[WID];
+
+    // Pop the frame.
+    StackP = FrameStart;
+
+    // If the current slot is empty, we need to free the slot after the
+    // pop.
+    bool SlotEmpty = (StackP == &SlotP->Data[0]);
+
+    if (SlotEmpty && SlotP->Prev) {
+      // Before removing the slot we need to reset StackP.
+      StackP = SlotP->PrevSlotStackPtr;
+
+      // Remove the slot.
+      SlotP = SlotP->Prev;
+      SafeFree(SlotP->Next, "Free slot.");
+      SlotP->Next = 0;
+    }
+  }
+}
+
+// Begin a data sharing context. Maintain a list of references to shared
+// variables. This list of references to shared variables will be passed
+// to one or more threads.
+// In L0 data sharing this is called by master thread.
+// In L1 data sharing this is called by active warp master thread. 
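+//
+// A compiler-generated call sequence could look roughly like the sketch below
+// (hypothetical shared variables a and b, two arguments assumed):
+//
+//   void **args;
+//   __kmpc_begin_sharing_variables(&args, 2); // publisher obtains the list
+//   args[0] = &a;
+//   args[1] = &b;
+//   ... workers fetch the same list via __kmpc_get_shared_variables ...
+//   __kmpc_end_sharing_variables();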
+EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs) { + omptarget_nvptx_globalArgs.EnsureSize(nArgs); + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// End a data sharing context. There is no need to have a list of refs +// to shared variables because the context in which those variables were +// shared has now ended. This should clean-up the list of references only +// without affecting the actual global storage of the variables. +// In L0 data sharing this is called by master thread. +// In L1 data sharing this is called by active warp master thread. +EXTERN void __kmpc_end_sharing_variables() { + omptarget_nvptx_globalArgs.DeInit(); +} + +// This function will return a list of references to global variables. This +// is how the workers will get a reference to the globalized variable. The +// members of this list will be passed to the outlined parallel function +// preserving the order. +// Called by all workers. +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs) { + *GlobalArgs = omptarget_nvptx_globalArgs.GetArgs(); +} + +// This function is used to init static memory manager. This manager is used to +// manage statically allocated global memory. This memory is allocated by the +// compiler and used to correctly implement globalization of the variables in +// target, teams and distribute regions. +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, + const void **frame) { + if (is_shared) { + *frame = buf; + return; + } + if (isSPMDExecutionMode) { + if (GetThreadIdInBlock() == 0) { + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + } + __kmpc_impl_syncthreads(); + return; + } + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + *frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size); + __kmpc_impl_threadfence(); +} + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared) { + if (is_shared) + return; + if (isSPMDExecutionMode) { + __kmpc_impl_syncthreads(); + if (GetThreadIdInBlock() == 0) { + omptarget_nvptx_simpleMemoryManager.Release(); + } + return; + } + __kmpc_impl_threadfence(); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "Must be called only in the target master thread."); + omptarget_nvptx_simpleMemoryManager.Release(); +} + diff --git a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu index 89c481bcf8da3..c3cc51c7c3625 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/libcall.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/libcall.cu @@ -1,414 +1,414 @@ -//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements the OpenMP runtime functions that can be -// invoked by the user in an OpenMP region -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/target_atomic.h" -#include "target_impl.h" - -EXTERN double omp_get_wtick(void) { - double rc = __kmpc_impl_get_wtick(); - PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); - return rc; -} - -EXTERN double omp_get_wtime(void) { - double rc = __kmpc_impl_get_wtime(); - PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); - return rc; -} - -EXTERN void omp_set_num_threads(int num) { - // Ignore it for SPMD mode. - if (isSPMDMode()) - return; - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); - if (num <= 0) { - WARNING0(LW_INPUT, "expected positive num; ignore\n"); - } else if (parallelLevel[GetWarpId()] == 0) { - nThreads = num; - } -} - -EXTERN int omp_get_num_threads(void) { - int rc = GetNumberOfOmpThreads(isSPMDMode()); - PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_max_threads(void) { - if (parallelLevel[GetWarpId()] > 0) - // We're already in parallel region. - return 1; // default is 1 thread avail - // Not currently in a parallel region, return what was set. - int rc = 1; - if (parallelLevel[GetWarpId()] == 0) - rc = nThreads; - ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); - PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_limit(void) { - if (isSPMDMode()) - return GetNumberOfThreadsInBlock(); - int rc = threadLimit; - PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); - return rc; -} - -EXTERN int omp_get_thread_num() { - bool isSPMDExecutionMode = isSPMDMode(); - int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - int rc = GetOmpThreadId(tid, isSPMDExecutionMode); - PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_num_procs(void) { - int rc = GetNumberOfProcsInDevice(isSPMDMode()); - PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_parallel(void) { - int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); - return rc; -} - -EXTERN int omp_in_final(void) { - // treat all tasks as final... Specs may expect runtime to keep - // track more precisely if a task was actively set by users... This - // is not explicitly specified; will treat as if runtime can - // actively decide to put a non-final task into a final one. 
- int rc = 1; - PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_dynamic(int flag) { - PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); -} - -EXTERN int omp_get_dynamic(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_nested(int flag) { - PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", - flag); -} - -EXTERN int omp_get_nested(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_max_active_levels(int level) { - PRINT(LD_IO, - "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", - level); -} - -EXTERN int omp_get_max_active_levels(void) { - int rc = 1; - PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_level(void) { - int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); - PRINT(LD_IO, "call omp_get_level() returns %d\n", level); - return level; -} - -EXTERN int omp_get_active_level(void) { - int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; - PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) - return level; -} - -EXTERN int omp_get_ancestor_thread_num(int level) { - if (isSPMDMode()) - return level == 1 ? GetThreadIdInBlock() : 0; - int rc = -1; - // If level is 0 or all parallel regions are not active - return 0. - unsigned parLevel = parallelLevel[GetWarpId()]; - if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - int totLevel = omp_get_level(); - if (level <= totLevel) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); - int steps = totLevel - level; - PRINT(LD_IO, "backtrack %d steps\n", steps); - ASSERT0(LT_FUSSY, currTaskDescr, - "do not expect fct to be called in a non-active thread"); - do { - if (DON(LD_IOD)) { - // print current state - omp_sched_t sched = currTaskDescr->GetRuntimeSched(); - PRINT(LD_ALL, - "task descr %s %d: %s, in par %d, rt sched %d," - " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", - "ancestor", steps, - (currTaskDescr->IsParallelConstruct() ? "par" : "task"), - (int)currTaskDescr->InParallelRegion(), (int)sched, - currTaskDescr->RuntimeChunkSize(), - (int)currTaskDescr->ThreadId(), (int)threadsInTeam, - (int)nThreads); - } - - if (currTaskDescr->IsParallelConstruct()) { - // found the level - if (!steps) { - rc = currTaskDescr->ThreadId(); - break; - } - steps--; - } - currTaskDescr = currTaskDescr->GetPrevTaskDescr(); - } while (currTaskDescr); - ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); - } - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 0; - } - PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, - rc) - return rc; -} - -EXTERN int omp_get_team_size(int level) { - if (isSPMDMode()) - return level == 1 ? GetNumberOfThreadsInBlock() : 1; - int rc = -1; - unsigned parLevel = parallelLevel[GetWarpId()]; - // If level is 0 or all parallel regions are not active - return 1. 
- if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { - rc = threadsInTeam; - } else if (level == 0 || - (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && - level <= parLevel) || - (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && - level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { - rc = 1; - } - PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) - return rc; -} - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - *kind = omp_sched_static; - *modifier = 1; - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - *kind = currTaskDescr->GetRuntimeSched(); - *modifier = currTaskDescr->RuntimeChunkSize(); - } - PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", - (int)*kind, *modifier); -} - -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { - PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, - modifier); - if (isRuntimeUninitialized()) { - ASSERT0(LT_FUSSY, isSPMDMode(), - "Expected SPMD mode only with uninitialized runtime."); - return; - } - if (kind >= omp_sched_static && kind < omp_sched_auto) { - omptarget_nvptx_TaskDescr *currTaskDescr = - getMyTopTaskDescriptor(isSPMDMode()); - currTaskDescr->SetRuntimeSched(kind); - currTaskDescr->RuntimeChunkSize() = modifier; - PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", - (int)currTaskDescr->GetRuntimeSched(), - currTaskDescr->RuntimeChunkSize()); - } -} - -EXTERN omp_proc_bind_t omp_get_proc_bind(void) { - PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); - return omp_proc_bind_true; -} - -EXTERN int omp_get_num_places(void) { - PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); - return 0; -} - -EXTERN int omp_get_place_num_procs(int place_num) { - PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); - return 0; -} - -EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { - PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); -} - -EXTERN int omp_get_place_num(void) { - PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); - return 0; -} - -EXTERN int omp_get_partition_num_places(void) { - PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); - return 0; -} - -EXTERN void omp_get_partition_place_nums(int *place_nums) { - PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); -} - -EXTERN int omp_get_cancellation(void) { - int rc = 0; - PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); - return rc; -} - -EXTERN void omp_set_default_device(int deviceId) { - PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); -} - -EXTERN int omp_get_default_device(void) { - PRINT0(LD_IO, - "call omp_get_default_device() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_devices(void) { - PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); - return 0; -} - -EXTERN int omp_get_num_teams(void) { - int rc = GetNumberOfOmpTeams(); - PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); - return rc; -} - -EXTERN int omp_get_team_num() { - int rc = GetOmpTeamId(); - PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); - return rc; -} - -EXTERN int omp_is_initial_device(void) { - PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); - return 0; // 0 by def on device -} - -// Unspecified on the device. 
-EXTERN int omp_get_initial_device(void) { - PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); - return 0; -} - -// Unused for now. -EXTERN int omp_get_max_task_priority(void) { - PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); - return 0; -} - -//////////////////////////////////////////////////////////////////////////////// -// locks -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void omp_init_lock(omp_lock_t *lock) { - __kmpc_impl_init_lock(lock); - PRINT0(LD_IO, "call omp_init_lock()\n"); -} - -EXTERN void omp_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_destroy_lock(lock); - PRINT0(LD_IO, "call omp_destroy_lock()\n"); -} - -EXTERN void omp_set_lock(omp_lock_t *lock) { - __kmpc_impl_set_lock(lock); - PRINT0(LD_IO, "call omp_set_lock()\n"); -} - -EXTERN void omp_unset_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); - PRINT0(LD_IO, "call omp_unset_lock()\n"); -} - -EXTERN int omp_test_lock(omp_lock_t *lock) { - int rc = __kmpc_impl_test_lock(lock); - PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); - return rc; -} - -// for xlf Fortran -// Fortran, the return is LOGICAL type - -#define FLOGICAL long -EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { - int ret = omp_is_initial_device(); - if (ret == 0) - return (FLOGICAL)0; - else - return (FLOGICAL)1; -} - -EXTERN int __xlf_omp_is_initial_device_i4() { - int ret = omp_is_initial_device(); - if (ret == 0) - return 0; - else - return 1; -} - -EXTERN long __xlf_omp_get_team_num_i4() { - int ret = omp_get_team_num(); - return (long)ret; -} - -EXTERN long __xlf_omp_get_num_teams_i4() { - int ret = omp_get_num_teams(); - return (long)ret; -} - -EXTERN void xlf_debug_print_int(int *p) { - printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_long(long *p) { - printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_float(float *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_double(double *p) { - printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); -} - -EXTERN void xlf_debug_print_addr(void *p) { - printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); -} +//===------------ libcall.cu - OpenMP GPU user calls ------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the OpenMP runtime functions that can be +// invoked by the user in an OpenMP region +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/target_atomic.h" +#include "target_impl.h" + +EXTERN double omp_get_wtick(void) { + double rc = __kmpc_impl_get_wtick(); + PRINT(LD_IO, "omp_get_wtick() returns %g\n", rc); + return rc; +} + +EXTERN double omp_get_wtime(void) { + double rc = __kmpc_impl_get_wtime(); + PRINT(LD_IO, "call omp_get_wtime() returns %g\n", rc); + return rc; +} + +EXTERN void omp_set_num_threads(int num) { + // Ignore it for SPMD mode. 
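+  // (In SPMD mode the team size is fixed when the kernel is launched, so the
+  // request could not take effect anyway.)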
+ if (isSPMDMode()) + return; + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num); + if (num <= 0) { + WARNING0(LW_INPUT, "expected positive num; ignore\n"); + } else if (parallelLevel[GetWarpId()] == 0) { + nThreads = num; + } +} + +EXTERN int omp_get_num_threads(void) { + int rc = GetNumberOfOmpThreads(isSPMDMode()); + PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_max_threads(void) { + if (parallelLevel[GetWarpId()] > 0) + // We're already in parallel region. + return 1; // default is 1 thread avail + // Not currently in a parallel region, return what was set. + int rc = 1; + if (parallelLevel[GetWarpId()] == 0) + rc = nThreads; + ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads"); + PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_limit(void) { + if (isSPMDMode()) + return GetNumberOfThreadsInBlock(); + int rc = threadLimit; + PRINT(LD_IO, "call omp_get_thread_limit() return %d\n", rc); + return rc; +} + +EXTERN int omp_get_thread_num() { + bool isSPMDExecutionMode = isSPMDMode(); + int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + int rc = GetOmpThreadId(tid, isSPMDExecutionMode); + PRINT(LD_IO, "call omp_get_thread_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_num_procs(void) { + int rc = GetNumberOfProcsInDevice(isSPMDMode()); + PRINT(LD_IO, "call omp_get_num_procs() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_parallel(void) { + int rc = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + PRINT(LD_IO, "call omp_in_parallel() returns %d\n", rc); + return rc; +} + +EXTERN int omp_in_final(void) { + // treat all tasks as final... Specs may expect runtime to keep + // track more precisely if a task was actively set by users... This + // is not explicitly specified; will treat as if runtime can + // actively decide to put a non-final task into a final one. + int rc = 1; + PRINT(LD_IO, "call omp_in_final() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_dynamic(int flag) { + PRINT(LD_IO, "call omp_set_dynamic(%d) is ignored (no support)\n", flag); +} + +EXTERN int omp_get_dynamic(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_dynamic() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_nested(int flag) { + PRINT(LD_IO, "call omp_set_nested(%d) is ignored (no nested support)\n", + flag); +} + +EXTERN int omp_get_nested(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_nested() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_max_active_levels(int level) { + PRINT(LD_IO, + "call omp_set_max_active_levels(%d) is ignored (no nested support)\n", + level); +} + +EXTERN int omp_get_max_active_levels(void) { + int rc = 1; + PRINT(LD_IO, "call omp_get_max_active_levels() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_level(void) { + int level = parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); + PRINT(LD_IO, "call omp_get_level() returns %d\n", level); + return level; +} + +EXTERN int omp_get_active_level(void) { + int level = parallelLevel[GetWarpId()] > OMP_ACTIVE_PARALLEL_LEVEL ? 1 : 0; + PRINT(LD_IO, "call omp_get_active_level() returns %d\n", level) + return level; +} + +EXTERN int omp_get_ancestor_thread_num(int level) { + if (isSPMDMode()) + return level == 1 ? GetThreadIdInBlock() : 0; + int rc = -1; + // If level is 0 or all parallel regions are not active - return 0. 
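+  // parallelLevel appears to keep the nesting depth in its low bits and an
+  // "active" flag at OMP_ACTIVE_PARALLEL_LEVEL (see the masking in
+  // omp_get_level above); e.g. a single active parallel region would be
+  // stored as OMP_ACTIVE_PARALLEL_LEVEL + 1.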
+ unsigned parLevel = parallelLevel[GetWarpId()]; + if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { + int totLevel = omp_get_level(); + if (level <= totLevel) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false); + int steps = totLevel - level; + PRINT(LD_IO, "backtrack %d steps\n", steps); + ASSERT0(LT_FUSSY, currTaskDescr, + "do not expect fct to be called in a non-active thread"); + do { + if (DON(LD_IOD)) { + // print current state + omp_sched_t sched = currTaskDescr->GetRuntimeSched(); + PRINT(LD_ALL, + "task descr %s %d: %s, in par %d, rt sched %d," + " chunk %" PRIu64 "; tid %d, tnum %d, nthreads %d\n", + "ancestor", steps, + (currTaskDescr->IsParallelConstruct() ? "par" : "task"), + (int)currTaskDescr->InParallelRegion(), (int)sched, + currTaskDescr->RuntimeChunkSize(), + (int)currTaskDescr->ThreadId(), (int)threadsInTeam, + (int)nThreads); + } + + if (currTaskDescr->IsParallelConstruct()) { + // found the level + if (!steps) { + rc = currTaskDescr->ThreadId(); + break; + } + steps--; + } + currTaskDescr = currTaskDescr->GetPrevTaskDescr(); + } while (currTaskDescr); + ASSERT0(LT_FUSSY, !steps, "expected to find all steps"); + } + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { + rc = 0; + } + PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level, + rc) + return rc; +} + +EXTERN int omp_get_team_size(int level) { + if (isSPMDMode()) + return level == 1 ? GetNumberOfThreadsInBlock() : 1; + int rc = -1; + unsigned parLevel = parallelLevel[GetWarpId()]; + // If level is 0 or all parallel regions are not active - return 1. 
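+  // For example, with one active level-1 region of, say, 64 threads,
+  // omp_get_team_size(1) returns threadsInTeam (64 here); any other valid
+  // level reports a team size of 1.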
+ if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) { + rc = threadsInTeam; + } else if (level == 0 || + (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL && + level <= parLevel) || + (level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL && + level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) { + rc = 1; + } + PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc) + return rc; +} + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier) { + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + *kind = omp_sched_static; + *modifier = 1; + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + *kind = currTaskDescr->GetRuntimeSched(); + *modifier = currTaskDescr->RuntimeChunkSize(); + } + PRINT(LD_IO, "call omp_get_schedule returns sched %d and modif %d\n", + (int)*kind, *modifier); +} + +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier) { + PRINT(LD_IO, "call omp_set_schedule(sched %d, modif %d)\n", (int)kind, + modifier); + if (isRuntimeUninitialized()) { + ASSERT0(LT_FUSSY, isSPMDMode(), + "Expected SPMD mode only with uninitialized runtime."); + return; + } + if (kind >= omp_sched_static && kind < omp_sched_auto) { + omptarget_nvptx_TaskDescr *currTaskDescr = + getMyTopTaskDescriptor(isSPMDMode()); + currTaskDescr->SetRuntimeSched(kind); + currTaskDescr->RuntimeChunkSize() = modifier; + PRINT(LD_IOD, "omp_set_schedule did set sched %d & modif %" PRIu64 "\n", + (int)currTaskDescr->GetRuntimeSched(), + currTaskDescr->RuntimeChunkSize()); + } +} + +EXTERN omp_proc_bind_t omp_get_proc_bind(void) { + PRINT0(LD_IO, "call omp_get_proc_bin() is true, regardless on state\n"); + return omp_proc_bind_true; +} + +EXTERN int omp_get_num_places(void) { + PRINT0(LD_IO, "call omp_get_num_places() returns 0\n"); + return 0; +} + +EXTERN int omp_get_place_num_procs(int place_num) { + PRINT0(LD_IO, "call omp_get_place_num_procs() returns 0\n"); + return 0; +} + +EXTERN void omp_get_place_proc_ids(int place_num, int *ids) { + PRINT0(LD_IO, "call to omp_get_place_proc_ids()\n"); +} + +EXTERN int omp_get_place_num(void) { + PRINT0(LD_IO, "call to omp_get_place_num() returns 0\n"); + return 0; +} + +EXTERN int omp_get_partition_num_places(void) { + PRINT0(LD_IO, "call to omp_get_partition_num_places() returns 0\n"); + return 0; +} + +EXTERN void omp_get_partition_place_nums(int *place_nums) { + PRINT0(LD_IO, "call to omp_get_partition_place_nums()\n"); +} + +EXTERN int omp_get_cancellation(void) { + int rc = 0; + PRINT(LD_IO, "call omp_get_cancellation() returns %d\n", rc); + return rc; +} + +EXTERN void omp_set_default_device(int deviceId) { + PRINT0(LD_IO, "call omp_get_default_device() is undef on device\n"); +} + +EXTERN int omp_get_default_device(void) { + PRINT0(LD_IO, + "call omp_get_default_device() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_devices(void) { + PRINT0(LD_IO, "call omp_get_num_devices() is undef on device, returns 0\n"); + return 0; +} + +EXTERN int omp_get_num_teams(void) { + int rc = GetNumberOfOmpTeams(); + PRINT(LD_IO, "call omp_get_num_teams() returns %d\n", rc); + return rc; +} + +EXTERN int omp_get_team_num() { + int rc = GetOmpTeamId(); + PRINT(LD_IO, "call omp_get_team_num() returns %d\n", rc); + return rc; +} + +EXTERN int omp_is_initial_device(void) { + PRINT0(LD_IO, "call omp_is_initial_device() returns 0\n"); + return 0; // 0 by def on device +} + +// Unspecified on the device. 
+EXTERN int omp_get_initial_device(void) { + PRINT0(LD_IO, "call omp_get_initial_device() returns 0\n"); + return 0; +} + +// Unused for now. +EXTERN int omp_get_max_task_priority(void) { + PRINT0(LD_IO, "call omp_get_max_task_priority() returns 0\n"); + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +// locks +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void omp_init_lock(omp_lock_t *lock) { + __kmpc_impl_init_lock(lock); + PRINT0(LD_IO, "call omp_init_lock()\n"); +} + +EXTERN void omp_destroy_lock(omp_lock_t *lock) { + __kmpc_impl_destroy_lock(lock); + PRINT0(LD_IO, "call omp_destroy_lock()\n"); +} + +EXTERN void omp_set_lock(omp_lock_t *lock) { + __kmpc_impl_set_lock(lock); + PRINT0(LD_IO, "call omp_set_lock()\n"); +} + +EXTERN void omp_unset_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); + PRINT0(LD_IO, "call omp_unset_lock()\n"); +} + +EXTERN int omp_test_lock(omp_lock_t *lock) { + int rc = __kmpc_impl_test_lock(lock); + PRINT(LD_IO, "call omp_test_lock() return %d\n", rc); + return rc; +} + +// for xlf Fortran +// Fortran, the return is LOGICAL type + +#define FLOGICAL long +EXTERN FLOGICAL __xlf_omp_is_initial_device_i8() { + int ret = omp_is_initial_device(); + if (ret == 0) + return (FLOGICAL)0; + else + return (FLOGICAL)1; +} + +EXTERN int __xlf_omp_is_initial_device_i4() { + int ret = omp_is_initial_device(); + if (ret == 0) + return 0; + else + return 1; +} + +EXTERN long __xlf_omp_get_team_num_i4() { + int ret = omp_get_team_num(); + return (long)ret; +} + +EXTERN long __xlf_omp_get_num_teams_i4() { + int ret = omp_get_num_teams(); + return (long)ret; +} + +EXTERN void xlf_debug_print_int(int *p) { + printf("xlf DEBUG %d): %p %d\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_long(long *p) { + printf("xlf DEBUG %d): %p %ld\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_float(float *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_double(double *p) { + printf("xlf DEBUG %d): %p %f\n", omp_get_team_num(), p, p == 0 ? 0 : *p); +} + +EXTERN void xlf_debug_print_addr(void *p) { + printf("xlf DEBUG %d): %p \n", omp_get_team_num(), p); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/loop.cu b/openmp/libomptarget/deviceRTLs/common/src/loop.cu index 417460db138a1..f8acadc8a0dbe 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/loop.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/loop.cu @@ -1,808 +1,808 @@ -//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of the KMPC interface -// for the loop construct plus other worksharing constructs that use the same -// interface as loops. 
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" -#include "common/target_atomic.h" - -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// -// template class that encapsulate all the helper functions -// -// T is loop iteration type (32 | 64) (unsigned | signed) -// ST is the signed version of T -//////////////////////////////////////////////////////////////////////////////// -//////////////////////////////////////////////////////////////////////////////// - -template class omptarget_nvptx_LoopSupport { -public: - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling with chunk - - // Generic implementation of OMP loop scheduling with static policy - /*! \brief Calculate initial bounds for static loop and stride - * @param[in] loc location in code of the call (not used here) - * @param[in] global_tid global thread id - * @param[in] schetype type of scheduling (see omptarget-nvptx.h) - * @param[in] plastiter pointer to last iteration - * @param[in,out] pointer to loop lower bound. it will contain value of - * lower bound of first chunk - * @param[in,out] pointer to loop upper bound. It will contain value of - * upper bound of first chunk - * @param[in,out] pointer to loop stride. It will contain value of stride - * between two successive chunks executed by the same thread - * @param[in] loop increment bump - * @param[in] chunk size - */ - - // helper function for static chunk - INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, - ST chunk, T entityId, T numberOfEntities) { - // each thread executes multiple chunks all of the same size, except - // the last one - - // distance between two successive chunks - stride = numberOfEntities * chunk; - lb = lb + entityId * chunk; - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - // Say ub' is the begining of the last chunk. Then who ever has a - // lower bound plus a multiple of the increment equal to ub' is - // the last one. - T beginingLastChunk = inputUb - (inputUb % chunk); - last = ((beginingLastChunk - lb) % stride) == 0; - } - - //////////////////////////////////////////////////////////////////////////////// - // Loop with static scheduling without chunk - - // helper function for static no chunk - INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, - ST &chunk, T entityId, - T numberOfEntities) { - // No chunk size specified. 
Each thread or warp gets at most one - // chunk; chunks are all almost of equal size - T loopSize = ub - lb + 1; - - chunk = loopSize / numberOfEntities; - T leftOver = loopSize - chunk * numberOfEntities; - - if (entityId < leftOver) { - chunk++; - lb = lb + entityId * chunk; - } else { - lb = lb + entityId * chunk + leftOver; - } - - T inputUb = ub; - ub = lb + chunk - 1; // Clang uses i <= ub - last = lb <= inputUb && inputUb <= ub; - stride = loopSize; // make sure we only do 1 chunk per warp - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for Static Init - - INLINE static void for_static_init(int32_t gtid, int32_t schedtype, - int32_t *plastiter, T *plower, T *pupper, - ST *pstride, ST chunk, - bool IsSPMDExecutionMode) { - // When IsRuntimeUninitialized is true, we assume that the caller is - // in an L0 parallel region and that all worker threads participate. - - // Assume we are in teams region or that we use a single block - // per target region - ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); - - // All warps that are in excess of the maximum requested, do - // not execute the loop - PRINT(LD_LOOP, - "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " - "%d, num tids %d\n", - (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, - (int)numberOfActiveOMPThreads); - ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, - "current thread is not needed here; error"); - - // copy - int lastiter = 0; - T lb = *plower; - T ub = *pupper; - ST stride = *pstride; - // init - switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { - case kmp_sched_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_balanced_chunk: { - if (chunk > 0) { - // round up to make sure the chunk is enough to cover all iterations - T tripCount = ub - lb + 1; // +1 because ub is inclusive - T span = (tripCount + numberOfActiveOMPThreads - 1) / - numberOfActiveOMPThreads; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - if (ub > oldUb) - ub = oldUb; - break; - } - } // note: if chunk <=0, use nochunk - case kmp_sched_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - case kmp_sched_distr_static_chunk: { - if (chunk > 0) { - ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } // note: if chunk <=0, use nochunk - } - case kmp_sched_distr_static_nochunk: { - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), - GetNumberOfOmpTeams()); - break; - } - case kmp_sched_distr_static_chunk_sched_static_chunkone: { - ForStaticChunk(lastiter, lb, ub, stride, chunk, - numberOfActiveOMPThreads * GetOmpTeamId() + gtid, - GetNumberOfOmpTeams() * numberOfActiveOMPThreads); - break; - } - default: { - ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); - PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", - (int)schedtype); - ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, - numberOfActiveOMPThreads); - break; - } - } - // copy back - *plastiter = lastiter; - *plower = lb; - *pupper = ub; - *pstride = stride; - PRINT(LD_LOOP, - "Got sched: Active %d, total %d: lb %lld, 
ub %lld, stride %lld, last " - "%d\n", - (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), - (long long)(*plower), (long long)(*pupper), (long long)(*pstride), - (int)lastiter); - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch Init - - INLINE static int OrderedSchedule(kmp_sched_t schedule) { - return schedule >= kmp_sched_ordered_first && - schedule <= kmp_sched_ordered_last; - } - - INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, - kmp_sched_t schedule, T lb, T ub, ST st, - ST chunk) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - return; - } - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); - T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); - T tripCount = ub - lb + 1; // +1 because ub is inclusive - ASSERT0(LT_FUSSY, threadId < tnum, - "current thread is not needed here; error"); - - /* Currently just ignore the monotonic and non-monotonic modifiers - * (the compiler isn't producing them * yet anyway). - * When it is we'll want to look at them somewhere here and use that - * information to add to our schedule choice. We shouldn't need to pass - * them on, they merely affect which schedule we can legally choose for - * various dynamic cases. (In particular, whether or not a stealing scheme - * is legal). - */ - schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); - - // Process schedule. - if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { - if (OrderedSchedule(schedule)) - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", - (long)tnum, (long long)tripCount, (int)schedule); - schedule = kmp_sched_static_chunk; - chunk = tripCount; // one thread gets the whole loop - } else if (schedule == kmp_sched_runtime) { - // process runtime - omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); - chunk = currTaskDescr->RuntimeChunkSize(); - switch (rtSched) { - case omp_sched_static: { - if (chunk > 0) - schedule = kmp_sched_static_chunk; - else - schedule = kmp_sched_static_nochunk; - break; - } - case omp_sched_auto: { - schedule = kmp_sched_static_chunk; - chunk = 1; - break; - } - case omp_sched_dynamic: - case omp_sched_guided: { - schedule = kmp_sched_dynamic; - break; - } - } - PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else if (schedule == kmp_sched_auto) { - schedule = kmp_sched_static_chunk; - chunk = 1; - PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - } else { - PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, - (long long)chunk); - ASSERT(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "unknown schedule %d & chunk %lld\n", (int)schedule, - (long long)chunk); - } - - // init schedules - if (schedule == kmp_sched_static_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save 
computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_balanced_chunk) { - ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - // round up to make sure the chunk is enough to cover all iterations - T span = (tripCount + tnum - 1) / tnum; - // perform chunk adjustment - chunk = (span + chunk - 1) & ~(chunk - 1); - - T oldUb = ub; - ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); - if (ub > oldUb) - ub = oldUb; - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_static_nochunk) { - ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); - // save sched state - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - // save ub - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - // compute static chunk - ST stride; - int lastiter = 0; - ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); - // save computed params - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; - PRINT(LD_LOOP, - "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 - ", next lower bound = %llu, stride = %llu\n", - (int)tnum, - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - (unsigned long long) - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( - tid)); - } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { - // save data - omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; - if (chunk < 1) - chunk = 1; - omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; - __kmpc_barrier(loc, threadId); - if (tid == 0) { - omptarget_nvptx_threadPrivateContext->Cnt() = 0; - __kmpc_impl_threadfence_block(); - } - __kmpc_barrier(loc, threadId); - PRINT(LD_LOOP, - "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 - ", chunk %" PRIu64 "\n", - (int)tnum, - (unsigned long long) - 
omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), - omptarget_nvptx_threadPrivateContext->Chunk(tid)); - } - } - - //////////////////////////////////////////////////////////////////////////////// - // Support for dispatch next - - INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, - int leader) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_sync(active, hi, leader); - lo = __kmpc_impl_shfl_sync(active, lo, leader); - return __kmpc_impl_pack(lo, hi); - } - - INLINE static uint64_t NextIter() { - __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); - uint32_t leader = __kmpc_impl_ffs(active) - 1; - uint32_t change = __kmpc_impl_popc(active); - __kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); - unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); - uint64_t warp_res; - if (rank == 0) { - warp_res = __kmpc_atomic_add( - (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), - (unsigned long long)change); - } - warp_res = Shuffle(active, warp_res, leader); - return warp_res + rank; - } - - INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, - T loopLowerBound, T loopUpperBound) { - T N = NextIter(); - lb = loopLowerBound + N * chunkSize; - ub = lb + chunkSize - 1; // Clang uses i <= ub - - // 3 result cases: - // a. lb and ub < loopUpperBound --> NOT_FINISHED - // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> - // NOT_FINISHED - // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED - // a. - if (lb <= loopUpperBound && ub < loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - return NOT_FINISHED; - } - // b. - if (lb <= loopUpperBound) { - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", - (long long)lb, (long long)ub, (long long)loopUpperBound); - ub = loopUpperBound; - return LAST_CHUNK; - } - // c. if we are here, we are in case 'c' - lb = loopUpperBound + 2; - ub = loopUpperBound + 1; - PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, - (long long)ub, (long long)loopUpperBound); - return FINISHED; - } - - INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, - T *plower, T *pupper, ST *pstride) { - if (checkRuntimeUninitialized(loc)) { - // In SPMD mode no need to check parallelism level - dynamic scheduling - // may appear only in L2 parallel regions with lightweight runtime. - ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); - if (*plast) - return DISPATCH_FINISHED; - *plast = 1; - return DISPATCH_NOTFINISHED; - } - // ID of a thread in its own warp - - // automatically selects thread or warp ID based on selected implementation - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), - "current thread is not needed here; error"); - // retrieve schedule - kmp_sched_t schedule = - omptarget_nvptx_threadPrivateContext->ScheduleType(tid); - - // xxx reduce to one - if (schedule == kmp_sched_static_chunk || - schedule == kmp_sched_static_nochunk) { - T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); - T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); - // finished? 
- if (myLb > ub) { - PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", - (long long)myLb, (long long)ub); - return DISPATCH_FINISHED; - } - // not finished, save current bounds - ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); - *plower = myLb; - T myUb = myLb + chunk - 1; // Clang uses i <= ub - if (myUb > ub) - myUb = ub; - *pupper = myUb; - *plast = (int32_t)(myUb == ub); - - // increment next lower bound by the stride - ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; - PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", - (long long)*plower, (long long)*pupper); - return DISPATCH_NOTFINISHED; - } - ASSERT0(LT_FUSSY, - schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, - "bad sched"); - T myLb, myUb; - int finished = DynamicNextChunk( - myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), - omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), - omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); - - if (finished == FINISHED) - return DISPATCH_FINISHED; - - // not finished (either not finished or last chunk) - *plast = (int32_t)(finished == LAST_CHUNK); - *plower = myLb; - *pupper = myUb; - *pstride = 1; - - PRINT(LD_LOOP, - "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " - "last %d\n", - (int)GetNumberOfOmpThreads(isSPMDMode()), - (int)GetNumberOfWorkersInTeam(), (long long)*plower, - (long long)*pupper, (long long)*pstride, (int)*plast); - return DISPATCH_NOTFINISHED; - } - - INLINE static void dispatch_fini() { - // nothing - } - - //////////////////////////////////////////////////////////////////////////////// - // end of template class that encapsulate all the helper functions - //////////////////////////////////////////////////////////////////////////////// -}; - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (dyn loops) -//////////////////////////////////////////////////////////////////////////////// - -// init -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, - int32_t schedule, int32_t lb, int32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint32_t lb, uint32_t ub, - int32_t st, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, - int32_t schedule, int64_t lb, int64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, - int32_t schedule, uint64_t lb, uint64_t ub, - int64_t st, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_init( - loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); -} - -// next -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, 
p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint32_t *p_lb, - uint32_t *p_ub, int32_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, - int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, - int32_t *p_last, uint64_t *p_lb, - uint64_t *p_ub, int64_t *p_st) { - PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); - return omptarget_nvptx_LoopSupport::dispatch_next( - loc, tid, p_last, p_lb, p_ub, p_st); -} - -// fini -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); - omptarget_nvptx_LoopSupport::dispatch_fini(); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP interface implementation (static loops) -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - checkSPMDMode(loc)); -} - -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t 
incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride, - int32_t incr, int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t schedtype, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride, - int64_t incr, int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/true); -} - -EXTERN -void __kmpc_for_static_init_4_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk) { - PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); - omptarget_nvptx_LoopSupport::for_static_init( - global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, - /*IsSPMDExecutionMode=*/false); -} - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_for_static_fini\n"); -} - -namespace { -INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { - int NumWarps = ((NumThreads + WARPSIZE 
- 1) / WARPSIZE); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp (at least, all present for the kernel launch) participate in the - // barrier. This is enforced when launching the parallel region. An - // exception is when there are < WARPSIZE workers. In this case only 1 worker - // is started, so we don't need a barrier. - if (NumThreads > 1) { -#endif - __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps); -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - } -#endif -} -}; // namespace - -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, - int32_t varNum, void *array) { - PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Expected non-SPMD mode + initialized runtime."); - - omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); - uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); - uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); - for (unsigned i = 0; i < varNum; i++) { - // Reset buffer. - if (gtid == 0) - *Buffer = 0; // Reset to minimum loop iteration value. - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Atomic max of iterations. - uint64_t *varArray = (uint64_t *)array; - uint64_t elem = varArray[i]; - (void)__kmpc_atomic_max((unsigned long long int *)Buffer, - (unsigned long long int)elem); - - // Barrier. - syncWorkersInGenericMode(NumThreads); - - // Read max value and update thread private array. - varArray[i] = *Buffer; - - // Barrier. - syncWorkersInGenericMode(NumThreads); - } -} +//===------------ loop.cu - NVPTX OpenMP loop constructs --------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of the KMPC interface +// for the loop construct plus other worksharing constructs that use the same +// interface as loops. +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" +#include "common/target_atomic.h" + +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// +// template class that encapsulate all the helper functions +// +// T is loop iteration type (32 | 64) (unsigned | signed) +// ST is the signed version of T +//////////////////////////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////////////////////////// + +template class omptarget_nvptx_LoopSupport { +public: + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling with chunk + + // Generic implementation of OMP loop scheduling with static policy + /*! \brief Calculate initial bounds for static loop and stride + * @param[in] loc location in code of the call (not used here) + * @param[in] global_tid global thread id + * @param[in] schetype type of scheduling (see omptarget-nvptx.h) + * @param[in] plastiter pointer to last iteration + * @param[in,out] pointer to loop lower bound. 
it will contain value of + * lower bound of first chunk + * @param[in,out] pointer to loop upper bound. It will contain value of + * upper bound of first chunk + * @param[in,out] pointer to loop stride. It will contain value of stride + * between two successive chunks executed by the same thread + * @param[in] loop increment bump + * @param[in] chunk size + */ + + // helper function for static chunk + INLINE static void ForStaticChunk(int &last, T &lb, T &ub, ST &stride, + ST chunk, T entityId, T numberOfEntities) { + // each thread executes multiple chunks all of the same size, except + // the last one + + // distance between two successive chunks + stride = numberOfEntities * chunk; + lb = lb + entityId * chunk; + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + // Say ub' is the begining of the last chunk. Then who ever has a + // lower bound plus a multiple of the increment equal to ub' is + // the last one. + T beginingLastChunk = inputUb - (inputUb % chunk); + last = ((beginingLastChunk - lb) % stride) == 0; + } + + //////////////////////////////////////////////////////////////////////////////// + // Loop with static scheduling without chunk + + // helper function for static no chunk + INLINE static void ForStaticNoChunk(int &last, T &lb, T &ub, ST &stride, + ST &chunk, T entityId, + T numberOfEntities) { + // No chunk size specified. Each thread or warp gets at most one + // chunk; chunks are all almost of equal size + T loopSize = ub - lb + 1; + + chunk = loopSize / numberOfEntities; + T leftOver = loopSize - chunk * numberOfEntities; + + if (entityId < leftOver) { + chunk++; + lb = lb + entityId * chunk; + } else { + lb = lb + entityId * chunk + leftOver; + } + + T inputUb = ub; + ub = lb + chunk - 1; // Clang uses i <= ub + last = lb <= inputUb && inputUb <= ub; + stride = loopSize; // make sure we only do 1 chunk per warp + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for Static Init + + INLINE static void for_static_init(int32_t gtid, int32_t schedtype, + int32_t *plastiter, T *plower, T *pupper, + ST *pstride, ST chunk, + bool IsSPMDExecutionMode) { + // When IsRuntimeUninitialized is true, we assume that the caller is + // in an L0 parallel region and that all worker threads participate. 
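// Illustration only, not part of the patch: a minimal host-side C++ sketch
// (all names invented) of the chunked static-schedule arithmetic implemented
// by ForStaticChunk above. Thread `id` of `num` owns chunks of size `chunk`
// spaced `num * chunk` iterations apart; like the source helper, no clipping
// against the loop's real upper bound is done at this point.
#include <cstdint>
#include <cstdio>

static void staticChunkBounds(int64_t &lb, int64_t &ub, int64_t &stride,
                              int64_t chunk, int64_t id, int64_t num) {
  stride = num * chunk; // distance between two successive chunks of a thread
  lb = lb + id * chunk; // lower bound of this thread's first chunk
  ub = lb + chunk - 1;  // inclusive upper bound (Clang emits i <= ub loops)
}

int main() {
  // 100 iterations split over 4 threads with a chunk of 8.
  for (int64_t id = 0; id < 4; ++id) {
    int64_t lb = 0, ub = 99, stride = 0;
    staticChunkBounds(lb, ub, stride, /*chunk=*/8, id, /*num=*/4);
    std::printf("thread %lld: first chunk [%lld, %lld], stride %lld\n",
                (long long)id, (long long)lb, (long long)ub,
                (long long)stride);
  }
  return 0;
}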
+ + // Assume we are in teams region or that we use a single block + // per target region + ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode); + + // All warps that are in excess of the maximum requested, do + // not execute the loop + PRINT(LD_LOOP, + "OMP Thread %d: schedule type %d, chunk size = %lld, mytid " + "%d, num tids %d\n", + (int)gtid, (int)schedtype, (long long)chunk, (int)gtid, + (int)numberOfActiveOMPThreads); + ASSERT0(LT_FUSSY, gtid < numberOfActiveOMPThreads, + "current thread is not needed here; error"); + + // copy + int lastiter = 0; + T lb = *plower; + T ub = *pupper; + ST stride = *pstride; + // init + switch (SCHEDULE_WITHOUT_MODIFIERS(schedtype)) { + case kmp_sched_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_balanced_chunk: { + if (chunk > 0) { + // round up to make sure the chunk is enough to cover all iterations + T tripCount = ub - lb + 1; // +1 because ub is inclusive + T span = (tripCount + numberOfActiveOMPThreads - 1) / + numberOfActiveOMPThreads; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + if (ub > oldUb) + ub = oldUb; + break; + } + } // note: if chunk <=0, use nochunk + case kmp_sched_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + case kmp_sched_distr_static_chunk: { + if (chunk > 0) { + ForStaticChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } // note: if chunk <=0, use nochunk + } + case kmp_sched_distr_static_nochunk: { + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, GetOmpTeamId(), + GetNumberOfOmpTeams()); + break; + } + case kmp_sched_distr_static_chunk_sched_static_chunkone: { + ForStaticChunk(lastiter, lb, ub, stride, chunk, + numberOfActiveOMPThreads * GetOmpTeamId() + gtid, + GetNumberOfOmpTeams() * numberOfActiveOMPThreads); + break; + } + default: { + ASSERT(LT_FUSSY, 0, "unknown schedtype %d", (int)schedtype); + PRINT(LD_LOOP, "unknown schedtype %d, revert back to static chunk\n", + (int)schedtype); + ForStaticChunk(lastiter, lb, ub, stride, chunk, gtid, + numberOfActiveOMPThreads); + break; + } + } + // copy back + *plastiter = lastiter; + *plower = lb; + *pupper = ub; + *pstride = stride; + PRINT(LD_LOOP, + "Got sched: Active %d, total %d: lb %lld, ub %lld, stride %lld, last " + "%d\n", + (int)numberOfActiveOMPThreads, (int)GetNumberOfWorkersInTeam(), + (long long)(*plower), (long long)(*pupper), (long long)(*pstride), + (int)lastiter); + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch Init + + INLINE static int OrderedSchedule(kmp_sched_t schedule) { + return schedule >= kmp_sched_ordered_first && + schedule <= kmp_sched_ordered_last; + } + + INLINE static void dispatch_init(kmp_Ident *loc, int32_t threadId, + kmp_sched_t schedule, T lb, T ub, ST st, + ST chunk) { + if (checkRuntimeUninitialized(loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. 
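// Illustration only, not part of the patch: the kmp_sched_static_balanced_chunk
// case above first computes the per-thread span (ceiling of tripCount over the
// thread count) and then rounds it up with `(span + chunk - 1) & ~(chunk - 1)`,
// which, as written in the source, yields a multiple of `chunk` when `chunk`
// is a power of two. A minimal host-side C++ sketch with invented values:
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t tripCount = 1000; // ub - lb + 1
  const uint64_t nThreads = 64;
  uint64_t chunk = 16;             // requested granularity (a power of two here)

  // Smallest span that covers all iterations when split evenly.
  uint64_t span = (tripCount + nThreads - 1) / nThreads;
  // Round the span up to the next multiple of the requested chunk.
  chunk = (span + chunk - 1) & ~(chunk - 1);

  std::printf("span = %llu, adjusted chunk = %llu\n",
              (unsigned long long)span, (unsigned long long)chunk);
  return 0;
}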
+ ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); + return; + } + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid); + T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc)); + T tripCount = ub - lb + 1; // +1 because ub is inclusive + ASSERT0(LT_FUSSY, threadId < tnum, + "current thread is not needed here; error"); + + /* Currently just ignore the monotonic and non-monotonic modifiers + * (the compiler isn't producing them * yet anyway). + * When it is we'll want to look at them somewhere here and use that + * information to add to our schedule choice. We shouldn't need to pass + * them on, they merely affect which schedule we can legally choose for + * various dynamic cases. (In particular, whether or not a stealing scheme + * is legal). + */ + schedule = SCHEDULE_WITHOUT_MODIFIERS(schedule); + + // Process schedule. + if (tnum == 1 || tripCount <= 1 || OrderedSchedule(schedule)) { + if (OrderedSchedule(schedule)) + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "go sequential as tnum=%ld, trip count %lld, ordered sched=%d\n", + (long)tnum, (long long)tripCount, (int)schedule); + schedule = kmp_sched_static_chunk; + chunk = tripCount; // one thread gets the whole loop + } else if (schedule == kmp_sched_runtime) { + // process runtime + omp_sched_t rtSched = currTaskDescr->GetRuntimeSched(); + chunk = currTaskDescr->RuntimeChunkSize(); + switch (rtSched) { + case omp_sched_static: { + if (chunk > 0) + schedule = kmp_sched_static_chunk; + else + schedule = kmp_sched_static_nochunk; + break; + } + case omp_sched_auto: { + schedule = kmp_sched_static_chunk; + chunk = 1; + break; + } + case omp_sched_dynamic: + case omp_sched_guided: { + schedule = kmp_sched_dynamic; + break; + } + } + PRINT(LD_LOOP, "Runtime sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else if (schedule == kmp_sched_auto) { + schedule = kmp_sched_static_chunk; + chunk = 1; + PRINT(LD_LOOP, "Auto sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + } else { + PRINT(LD_LOOP, "Dyn sched is %d with chunk %lld\n", (int)schedule, + (long long)chunk); + ASSERT(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "unknown schedule %d & chunk %lld\n", (int)schedule, + (long long)chunk); + } + + // init schedules + if (schedule == kmp_sched_static_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_static_balanced_chunk) { + ASSERT0(LT_FUSSY, chunk > 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // 
save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + // round up to make sure the chunk is enough to cover all iterations + T span = (tripCount + tnum - 1) / tnum; + // perform chunk adjustment + chunk = (span + chunk - 1) & ~(chunk - 1); + + T oldUb = ub; + ForStaticChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + ASSERT0(LT_FUSSY, ub >= lb, "ub must be >= lb."); + if (ub > oldUb) + ub = oldUb; + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static chunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_static_nochunk) { + ASSERT0(LT_FUSSY, chunk == 0, "bad chunk value"); + // save sched state + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + // save ub + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + // compute static chunk + ST stride; + int lastiter = 0; + ForStaticNoChunk(lastiter, lb, ub, stride, chunk, threadId, tnum); + // save computed params + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + omptarget_nvptx_threadPrivateContext->Stride(tid) = stride; + PRINT(LD_LOOP, + "dispatch init (static nochunk) : num threads = %d, ub = %" PRId64 + ", next lower bound = %llu, stride = %llu\n", + (int)tnum, + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + (unsigned long long)omptarget_nvptx_threadPrivateContext->Stride( + tid)); + } else if (schedule == kmp_sched_dynamic || schedule == kmp_sched_guided) { + // save data + omptarget_nvptx_threadPrivateContext->ScheduleType(tid) = schedule; + if (chunk < 1) + chunk = 1; + omptarget_nvptx_threadPrivateContext->Chunk(tid) = chunk; + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid) = ub; + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = lb; + __kmpc_barrier(loc, threadId); + if (tid == 0) { + omptarget_nvptx_threadPrivateContext->Cnt() = 0; + __kmpc_impl_threadfence_block(); + } + __kmpc_barrier(loc, threadId); + PRINT(LD_LOOP, + "dispatch init (dyn) : num threads = %d, lb = %llu, ub = %" PRId64 + ", chunk %" PRIu64 "\n", + (int)tnum, + (unsigned long long) + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid), + omptarget_nvptx_threadPrivateContext->Chunk(tid)); + } + } + + //////////////////////////////////////////////////////////////////////////////// + // Support for dispatch next + + INLINE static uint64_t Shuffle(__kmpc_impl_lanemask_t active, int64_t val, + int leader) { + uint32_t lo, hi; + __kmpc_impl_unpack(val, lo, hi); + hi = __kmpc_impl_shfl_sync(active, hi, leader); + lo = __kmpc_impl_shfl_sync(active, lo, leader); + return __kmpc_impl_pack(lo, hi); + } + + INLINE static uint64_t NextIter() { + __kmpc_impl_lanemask_t active = __kmpc_impl_activemask(); + uint32_t leader = __kmpc_impl_ffs(active) - 1; + uint32_t change = __kmpc_impl_popc(active); + 
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt(); + unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt); + uint64_t warp_res; + if (rank == 0) { + warp_res = __kmpc_atomic_add( + (unsigned long long *)&omptarget_nvptx_threadPrivateContext->Cnt(), + (unsigned long long)change); + } + warp_res = Shuffle(active, warp_res, leader); + return warp_res + rank; + } + + INLINE static int DynamicNextChunk(T &lb, T &ub, T chunkSize, + T loopLowerBound, T loopUpperBound) { + T N = NextIter(); + lb = loopLowerBound + N * chunkSize; + ub = lb + chunkSize - 1; // Clang uses i <= ub + + // 3 result cases: + // a. lb and ub < loopUpperBound --> NOT_FINISHED + // b. lb < loopUpperBound and ub >= loopUpperBound: last chunk --> + // NOT_FINISHED + // c. lb and ub >= loopUpperBound: empty chunk --> FINISHED + // a. + if (lb <= loopUpperBound && ub < loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; not finished\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + return NOT_FINISHED; + } + // b. + if (lb <= loopUpperBound) { + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; clip to loop ub\n", + (long long)lb, (long long)ub, (long long)loopUpperBound); + ub = loopUpperBound; + return LAST_CHUNK; + } + // c. if we are here, we are in case 'c' + lb = loopUpperBound + 2; + ub = loopUpperBound + 1; + PRINT(LD_LOOPD, "lb %lld, ub %lld, loop ub %lld; finished\n", (long long)lb, + (long long)ub, (long long)loopUpperBound); + return FINISHED; + } + + INLINE static int dispatch_next(kmp_Ident *loc, int32_t gtid, int32_t *plast, + T *plower, T *pupper, ST *pstride) { + if (checkRuntimeUninitialized(loc)) { + // In SPMD mode no need to check parallelism level - dynamic scheduling + // may appear only in L2 parallel regions with lightweight runtime. + ASSERT0(LT_FUSSY, checkSPMDMode(loc), "Expected non-SPMD mode."); + if (*plast) + return DISPATCH_FINISHED; + *plast = 1; + return DISPATCH_NOTFINISHED; + } + // ID of a thread in its own warp + + // automatically selects thread or warp ID based on selected implementation + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)), + "current thread is not needed here; error"); + // retrieve schedule + kmp_sched_t schedule = + omptarget_nvptx_threadPrivateContext->ScheduleType(tid); + + // xxx reduce to one + if (schedule == kmp_sched_static_chunk || + schedule == kmp_sched_static_nochunk) { + T myLb = omptarget_nvptx_threadPrivateContext->NextLowerBound(tid); + T ub = omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid); + // finished? 
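// Illustration only, not part of the patch: a minimal host-side C++ sketch
// (names invented) of the three-way case split performed by DynamicNextChunk
// above -- a chunk is either fully inside the loop bounds, the last (clipped)
// chunk, or entirely past the upper bound.
#include <cstdint>
#include <cstdio>

enum ChunkState { kNotFinished, kLastChunk, kFinished };

static ChunkState classifyChunk(uint64_t n, uint64_t chunkSize, uint64_t loopLb,
                                uint64_t loopUb, uint64_t &lb, uint64_t &ub) {
  lb = loopLb + n * chunkSize;
  ub = lb + chunkSize - 1; // inclusive, Clang uses i <= ub
  if (lb <= loopUb && ub < loopUb)
    return kNotFinished;
  if (lb <= loopUb) { // straddles the bound: clip and mark as last chunk
    ub = loopUb;
    return kLastChunk;
  }
  return kFinished; // empty chunk, nothing left to do
}

int main() {
  uint64_t lb, ub;
  for (uint64_t n = 0; n < 5; ++n) {
    ChunkState s =
        classifyChunk(n, /*chunkSize=*/8, /*loopLb=*/0, /*loopUb=*/25, lb, ub);
    std::printf("chunk %llu -> [%llu, %llu], state %d\n", (unsigned long long)n,
                (unsigned long long)lb, (unsigned long long)ub, (int)s);
  }
  return 0;
}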
+ if (myLb > ub) { + PRINT(LD_LOOP, "static loop finished with myLb %lld, ub %lld\n", + (long long)myLb, (long long)ub); + return DISPATCH_FINISHED; + } + // not finished, save current bounds + ST chunk = omptarget_nvptx_threadPrivateContext->Chunk(tid); + *plower = myLb; + T myUb = myLb + chunk - 1; // Clang uses i <= ub + if (myUb > ub) + myUb = ub; + *pupper = myUb; + *plast = (int32_t)(myUb == ub); + + // increment next lower bound by the stride + ST stride = omptarget_nvptx_threadPrivateContext->Stride(tid); + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid) = myLb + stride; + PRINT(LD_LOOP, "static loop continues with myLb %lld, myUb %lld\n", + (long long)*plower, (long long)*pupper); + return DISPATCH_NOTFINISHED; + } + ASSERT0(LT_FUSSY, + schedule == kmp_sched_dynamic || schedule == kmp_sched_guided, + "bad sched"); + T myLb, myUb; + int finished = DynamicNextChunk( + myLb, myUb, omptarget_nvptx_threadPrivateContext->Chunk(tid), + omptarget_nvptx_threadPrivateContext->NextLowerBound(tid), + omptarget_nvptx_threadPrivateContext->LoopUpperBound(tid)); + + if (finished == FINISHED) + return DISPATCH_FINISHED; + + // not finished (either not finished or last chunk) + *plast = (int32_t)(finished == LAST_CHUNK); + *plower = myLb; + *pupper = myUb; + *pstride = 1; + + PRINT(LD_LOOP, + "Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, " + "last %d\n", + (int)GetNumberOfOmpThreads(isSPMDMode()), + (int)GetNumberOfWorkersInTeam(), (long long)*plower, + (long long)*pupper, (long long)*pstride, (int)*plast); + return DISPATCH_NOTFINISHED; + } + + INLINE static void dispatch_fini() { + // nothing + } + + //////////////////////////////////////////////////////////////////////////////// + // end of template class that encapsulate all the helper functions + //////////////////////////////////////////////////////////////////////////////// +}; + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (dyn loops) +//////////////////////////////////////////////////////////////////////////////// + +// init +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t tid, + int32_t schedule, int32_t lb, int32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint32_t lb, uint32_t ub, + int32_t st, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t tid, + int32_t schedule, int64_t lb, int64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t tid, + int32_t schedule, uint64_t lb, uint64_t ub, + int64_t st, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_dispatch_init_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_init( + loc, tid, (kmp_sched_t)schedule, lb, ub, st, chunk); +} + +// next +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int32_t *p_lb, int32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, 
p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint32_t *p_lb, + uint32_t *p_ub, int32_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_4u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t tid, int32_t *p_last, + int64_t *p_lb, int64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t tid, + int32_t *p_last, uint64_t *p_lb, + uint64_t *p_ub, int64_t *p_st) { + PRINT0(LD_IO, "call kmpc_dispatch_next_8u\n"); + return omptarget_nvptx_LoopSupport::dispatch_next( + loc, tid, p_last, p_lb, p_ub, p_st); +} + +// fini +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_4u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_dispatch_fini_8u\n"); + omptarget_nvptx_LoopSupport::dispatch_fini(); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP interface implementation (static loops) +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + checkSPMDMode(loc)); +} + +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t 
incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride, + int32_t incr, int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t schedtype, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride, + int64_t incr, int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_spmd\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/true); +} + +EXTERN +void __kmpc_for_static_init_4_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int32_t *plower, int32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_4u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_8_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + int64_t *plower, int64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t schedtype, int32_t *plastiter, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk) { + PRINT0(LD_IO, "call kmpc_for_static_init_8u_simple_generic\n"); + omptarget_nvptx_LoopSupport::for_static_init( + global_tid, schedtype, plastiter, plower, pupper, pstride, chunk, + /*IsSPMDExecutionMode=*/false); +} + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_for_static_fini\n"); +} + +namespace { +INLINE void syncWorkersInGenericMode(uint32_t NumThreads) { + int NumWarps = ((NumThreads + WARPSIZE 
- 1) / WARPSIZE); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp (at least, all present for the kernel launch) participate in the + // barrier. This is enforced when launching the parallel region. An + // exception is when there are < WARPSIZE workers. In this case only 1 worker + // is started, so we don't need a barrier. + if (NumThreads > 1) { +#endif + __kmpc_impl_named_sync(L1_BARRIER, WARPSIZE * NumWarps); +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + } +#endif +} +}; // namespace + +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid, + int32_t varNum, void *array) { + PRINT0(LD_IO, "call to __kmpc_reduce_conditional_lastprivate(...)\n"); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Expected non-SPMD mode + initialized runtime."); + + omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor(); + uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc)); + uint64_t *Buffer = teamDescr.getLastprivateIterBuffer(); + for (unsigned i = 0; i < varNum; i++) { + // Reset buffer. + if (gtid == 0) + *Buffer = 0; // Reset to minimum loop iteration value. + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Atomic max of iterations. + uint64_t *varArray = (uint64_t *)array; + uint64_t elem = varArray[i]; + (void)__kmpc_atomic_max((unsigned long long int *)Buffer, + (unsigned long long int)elem); + + // Barrier. + syncWorkersInGenericMode(NumThreads); + + // Read max value and update thread private array. + varArray[i] = *Buffer; + + // Barrier. + syncWorkersInGenericMode(NumThreads); + } +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu index 5bef3b89a1721..f335dac5484a0 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omp_data.cu @@ -1,68 +1,68 @@ -//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the data objects used on the GPU device. 
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/device_environment.h" - -//////////////////////////////////////////////////////////////////////////////// -// global device environment -//////////////////////////////////////////////////////////////////////////////// - -DEVICE omptarget_device_environmentTy omptarget_device_environment; - -//////////////////////////////////////////////////////////////////////////////// -// global data holding OpenMP state information -//////////////////////////////////////////////////////////////////////////////// - -DEVICE - omptarget_nvptx_Queue - omptarget_nvptx_device_State[MAX_SM]; - -DEVICE omptarget_nvptx_SimpleMemoryManager - omptarget_nvptx_simpleMemoryManager; -DEVICE SHARED uint32_t usedMemIdx; -DEVICE SHARED uint32_t usedSlotIdx; - -DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; -DEVICE SHARED uint16_t threadLimit; -DEVICE SHARED uint16_t threadsInTeam; -DEVICE SHARED uint16_t nThreads; -// Pointer to this team's OpenMP state object -DEVICE SHARED - omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; - -//////////////////////////////////////////////////////////////////////////////// -// The team master sets the outlined parallel function in this variable to -// communicate with the workers. Since it is in shared memory, there is one -// copy of these variables for each kernel, instance, and team. -//////////////////////////////////////////////////////////////////////////////// -volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP kernel execution parameters -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED uint32_t execution_param; - -//////////////////////////////////////////////////////////////////////////////// -// Data sharing state -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED DataSharingStateTy DataSharingState; - -//////////////////////////////////////////////////////////////////////////////// -// Scratchpad for teams reduction. -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED void *ReductionScratchpadPtr; - -//////////////////////////////////////////////////////////////////////////////// -// Data sharing related variables. -//////////////////////////////////////////////////////////////////////////////// -DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; +//===------------ omp_data.cu - OpenMP GPU objects --------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the data objects used on the GPU device. 
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/device_environment.h" + +//////////////////////////////////////////////////////////////////////////////// +// global device environment +//////////////////////////////////////////////////////////////////////////////// + +DEVICE omptarget_device_environmentTy omptarget_device_environment; + +//////////////////////////////////////////////////////////////////////////////// +// global data holding OpenMP state information +//////////////////////////////////////////////////////////////////////////////// + +DEVICE + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +DEVICE omptarget_nvptx_SimpleMemoryManager + omptarget_nvptx_simpleMemoryManager; +DEVICE SHARED uint32_t usedMemIdx; +DEVICE SHARED uint32_t usedSlotIdx; + +DEVICE SHARED uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE]; +DEVICE SHARED uint16_t threadLimit; +DEVICE SHARED uint16_t threadsInTeam; +DEVICE SHARED uint16_t nThreads; +// Pointer to this team's OpenMP state object +DEVICE SHARED + omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext; + +//////////////////////////////////////////////////////////////////////////////// +// The team master sets the outlined parallel function in this variable to +// communicate with the workers. Since it is in shared memory, there is one +// copy of these variables for each kernel, instance, and team. +//////////////////////////////////////////////////////////////////////////////// +volatile DEVICE SHARED omptarget_nvptx_WorkFn omptarget_nvptx_workFn; + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP kernel execution parameters +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED uint32_t execution_param; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing state +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED DataSharingStateTy DataSharingState; + +//////////////////////////////////////////////////////////////////////////////// +// Scratchpad for teams reduction. +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED void *ReductionScratchpadPtr; + +//////////////////////////////////////////////////////////////////////////////// +// Data sharing related variables. +//////////////////////////////////////////////////////////////////////////////// +DEVICE SHARED omptarget_nvptx_SharedArgs omptarget_nvptx_globalArgs; diff --git a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu index 23fbd00cacaf9..305ff626699a1 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/omptarget.cu @@ -1,179 +1,179 @@ -//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the initialization code for the GPU -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// global data tables -//////////////////////////////////////////////////////////////////////////////// - -extern DEVICE - omptarget_nvptx_Queue - omptarget_nvptx_device_State[MAX_SM]; - -//////////////////////////////////////////////////////////////////////////////// -// init entry points -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_kernel_init_params(void *Ptr) { - PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", - OMPTARGET_NVPTX_VERSION); - - SetTeamsReductionScratchpadPtr(Ptr); -} - -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { - PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", - OMPTARGET_NVPTX_VERSION); - ASSERT0(LT_FUSSY, RequiresOMPRuntime, - "Generic always requires initialized runtime."); - setExecutionParameters(Generic, RuntimeInitialized); - for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) - parallelLevel[I] = 0; - - int threadIdInBlock = GetThreadIdInBlock(); - ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), - "__kmpc_kernel_init() must be called by team master warp only!"); - PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); - - // Get a state object from the queue. - int slot = __kmpc_impl_smid() % MAX_SM; - usedSlotIdx = slot; - omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[slot].Dequeue(); - - // init thread private - int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); - omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); - - // init team context - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - currTeamDescr.InitTeamDescr(); - // this thread will start execution... has to update its task ICV - // to point to the level zero task ICV. That ICV was init in - // InitTeamDescr() - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTeamDescr.LevelZeroTaskDescr()); - - // set number of threads and thread limit in team to started value - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - nThreads = GetNumberOfThreadsInBlock(); - threadLimit = ThreadLimit; -} - -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, - "Generic always requires initialized runtime."); - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - // Done with work. Kill the workers. - omptarget_nvptx_workFn = 0; -} - -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing) { - PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); - - setExecutionParameters(Spmd, RequiresOMPRuntime ? 
RuntimeInitialized - : RuntimeUninitialized); - int threadId = GetThreadIdInBlock(); - if (threadId == 0) { - usedSlotIdx = __kmpc_impl_smid() % MAX_SM; - parallelLevel[0] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } else if (GetLaneId() == 0) { - parallelLevel[GetWarpId()] = - 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); - } - if (!RequiresOMPRuntime) { - // Runtime is not required - exit. - __kmpc_impl_syncthreads(); - return; - } - - // - // Team Context Initialization. - // - // In SPMD mode there is no master thread so use any cuda thread for team - // context initialization. - if (threadId == 0) { - // Get a state object from the queue. - omptarget_nvptx_threadPrivateContext = - omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // init team context - currTeamDescr.InitTeamDescr(); - } - __kmpc_impl_syncthreads(); - - omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - - // - // Initialize task descr for each thread. - // - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - - // init thread private from init value - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)ThreadLimit); - - if (RequiresDataSharing && GetLaneId() == 0) { - // Warp master initializes data sharing environment. - unsigned WID = threadId / WARPSIZE; - __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( - WID, WID == WARPSIZE - 1); - DataSharingState.SlotPtr[WID] = RootS; - DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; - } -} - -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { - __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); -} - -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { - // We're not going to pop the task descr stack of each thread since - // there are no more parallel regions in SPMD mode. - if (!RequiresOMPRuntime) - return; - - __kmpc_impl_syncthreads(); - int threadId = GetThreadIdInBlock(); - if (threadId == 0) { - // Enqueue omp state object for use by another team. - int slot = usedSlotIdx; - omptarget_nvptx_device_State[slot].Enqueue( - omptarget_nvptx_threadPrivateContext); - } -} - -// Return true if the current target region is executed in SPMD mode. -EXTERN int8_t __kmpc_is_spmd_exec_mode() { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); - return isSPMDMode(); -} +//===--- omptarget.cu - OpenMP GPU initialization ---------------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the initialization code for the GPU +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// global data tables +//////////////////////////////////////////////////////////////////////////////// + +extern DEVICE + omptarget_nvptx_Queue + omptarget_nvptx_device_State[MAX_SM]; + +//////////////////////////////////////////////////////////////////////////////// +// init entry points +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_kernel_init_params(void *Ptr) { + PRINT(LD_IO, "call to __kmpc_kernel_init_params with version %f\n", + OMPTARGET_NVPTX_VERSION); + + SetTeamsReductionScratchpadPtr(Ptr); +} + +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) { + PRINT(LD_IO, "call to __kmpc_kernel_init with version %f\n", + OMPTARGET_NVPTX_VERSION); + ASSERT0(LT_FUSSY, RequiresOMPRuntime, + "Generic always requires initialized runtime."); + setExecutionParameters(Generic, RuntimeInitialized); + for (int I = 0; I < MAX_THREADS_PER_TEAM / WARPSIZE; ++I) + parallelLevel[I] = 0; + + int threadIdInBlock = GetThreadIdInBlock(); + ASSERT0(LT_FUSSY, threadIdInBlock == GetMasterThreadID(), + "__kmpc_kernel_init() must be called by team master warp only!"); + PRINT0(LD_IO, "call to __kmpc_kernel_init for master\n"); + + // Get a state object from the queue. + int slot = __kmpc_impl_smid() % MAX_SM; + usedSlotIdx = slot; + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[slot].Dequeue(); + + // init thread private + int threadId = GetLogicalThreadIdInBlock(/*isSPMDExecutionMode=*/false); + omptarget_nvptx_threadPrivateContext->InitThreadPrivateContext(threadId); + + // init team context + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + currTeamDescr.InitTeamDescr(); + // this thread will start execution... has to update its task ICV + // to point to the level zero task ICV. That ICV was init in + // InitTeamDescr() + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTeamDescr.LevelZeroTaskDescr()); + + // set number of threads and thread limit in team to started value + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + nThreads = GetNumberOfThreadsInBlock(); + threadLimit = ThreadLimit; +} + +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_deinit\n"); + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, + "Generic always requires initialized runtime."); + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + // Done with work. Kill the workers. + omptarget_nvptx_workFn = 0; +} + +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing) { + PRINT0(LD_IO, "call to __kmpc_spmd_kernel_init\n"); + + setExecutionParameters(Spmd, RequiresOMPRuntime ? 
RuntimeInitialized + : RuntimeUninitialized); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + usedSlotIdx = __kmpc_impl_smid() % MAX_SM; + parallelLevel[0] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } else if (GetLaneId() == 0) { + parallelLevel[GetWarpId()] = + 1 + (GetNumberOfThreadsInBlock() > 1 ? OMP_ACTIVE_PARALLEL_LEVEL : 0); + } + if (!RequiresOMPRuntime) { + // Runtime is not required - exit. + __kmpc_impl_syncthreads(); + return; + } + + // + // Team Context Initialization. + // + // In SPMD mode there is no master thread so use any cuda thread for team + // context initialization. + if (threadId == 0) { + // Get a state object from the queue. + omptarget_nvptx_threadPrivateContext = + omptarget_nvptx_device_State[usedSlotIdx].Dequeue(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // init team context + currTeamDescr.InitTeamDescr(); + } + __kmpc_impl_syncthreads(); + + omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + + // + // Initialize task descr for each thread. + // + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + + // init thread private from init value + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + (int)newTaskDescr->ThreadId(), (int)ThreadLimit); + + if (RequiresDataSharing && GetLaneId() == 0) { + // Warp master initializes data sharing environment. + unsigned WID = threadId / WARPSIZE; + __kmpc_data_sharing_slot *RootS = currTeamDescr.RootS( + WID, WID == WARPSIZE - 1); + DataSharingState.SlotPtr[WID] = RootS; + DataSharingState.StackPtr[WID] = (void *)&RootS->Data[0]; + } +} + +EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit() { + __kmpc_spmd_kernel_deinit_v2(isRuntimeInitialized()); +} + +EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) { + // We're not going to pop the task descr stack of each thread since + // there are no more parallel regions in SPMD mode. + if (!RequiresOMPRuntime) + return; + + __kmpc_impl_syncthreads(); + int threadId = GetThreadIdInBlock(); + if (threadId == 0) { + // Enqueue omp state object for use by another team. + int slot = usedSlotIdx; + omptarget_nvptx_device_State[slot].Enqueue( + omptarget_nvptx_threadPrivateContext); + } +} + +// Return true if the current target region is executed in SPMD mode. +EXTERN int8_t __kmpc_is_spmd_exec_mode() { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_is_spmd_exec_mode\n"); + return isSPMDMode(); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu index ab031e99e51f9..c7c41021d4bbc 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/parallel.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/parallel.cu @@ -1,470 +1,470 @@ -//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Parallel implementation in the GPU. Here is the pattern: -// -// while (not finished) { -// -// if (master) { -// sequential code, decide which par loop to do, or if finished -// __kmpc_kernel_prepare_parallel() // exec by master only -// } -// syncthreads // A -// __kmpc_kernel_parallel() // exec by all -// if (this thread is included in the parallel) { -// switch () for all parallel loops -// __kmpc_kernel_end_parallel() // exec only by threads in parallel -// } -// -// -// The reason we don't exec end_parallel for the threads not included -// in the parallel loop is that for each barrier in the parallel -// region, these non-included threads will cycle through the -// syncthread A. Thus they must preserve their current threadId that -// is larger than thread in team. -// -// To make a long story short... -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -typedef struct ConvergentSimdJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t slimForNextSimd; -} ConvergentSimdJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent simd (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - int32_t SimdLimit = - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); - job->slimForNextSimd = SimdLimit; - - int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); - // reset simdlimit to avoid propagating to successive #simd - if (SimdLimitSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; - - // We cannot have more than the # of convergent threads. - if (SimdLimitSource > 0) - *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); - else - *NumLanes = ConvergentSize; - ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", - (int)*NumLanes); - - // Set to true for lanes participating in the simd region. - bool isActive = false; - // Initialize state for active threads. 
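// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: a host-side walk-through of the
// lane bookkeeping used by __kmpc_kernel_convergent_simd above. Starting from
// LaneSource = -1, each call advances to the next set bit of the convergent
// mask as the "source" lane; a lane's LaneId is the popcount of the mask bits
// below its own position. GCC/Clang builtins stand in for the __kmpc_impl_*
// wrappers, and the mask value is hypothetical.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Mask = 0x0000F0F0u; // hypothetical set of convergent lanes
  int32_t LaneSource = -1;
  for (int Step = 0; Step < __builtin_popcount(Mask); ++Step) {
    uint32_t WorkRemaining = Mask >> (LaneSource + 1);
    LaneSource += __builtin_ffs(WorkRemaining);
    bool IsFinal = __builtin_popcount(WorkRemaining) == 1;
    std::printf("step %d: source lane %d%s\n", Step, LaneSource,
                IsFinal ? " (final)" : "");
  }
  return 0;
}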
- if (*LaneId < *NumLanes) { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - // install top descriptor from the thread for which the lanes are working. - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - sourceTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = - job->slimForNextSimd; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); -} - -typedef struct ConvergentParallelJob { - omptarget_nvptx_TaskDescr taskDescr; - omptarget_nvptx_TaskDescr *convHeadTaskDescr; - uint16_t tnumForNextPar; -} ConvergentParallelJob; - -//////////////////////////////////////////////////////////////////////////////// -// support for convergent parallelism (team of threads in a warp only) -//////////////////////////////////////////////////////////////////////////////// -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource) { - PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); - __kmpc_impl_lanemask_t ConvergentMask = Mask; - int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); - __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); - *LaneSource += __kmpc_impl_ffs(WorkRemaining); - *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); - - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; - - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - int32_t NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - job->tnumForNextPar = NumThreadsClause; - - int32_t NumThreadsSource = - __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); - // reset numthreads to avoid propagating to successive #parallel - if (NumThreadsSource > 0 && threadId == sourceThreadId) - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - 0; - - // We cannot have more than the # of convergent threads. - uint16_t NumThreads; - if (NumThreadsSource > 0) - NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); - else - NumThreads = ConvergentSize; - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (OmpId < NumThreads) { - // init L2 task descriptor and storage for the L1 parallel task descriptor. 
- omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - omptarget_nvptx_TaskDescr *sourceTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( - sourceThreadId); - job->convHeadTaskDescr = currTaskDescr; - newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - isActive = true; - } - - // requires a memory fence between threads of a warp - return isActive; -} - -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); - // pop stack - int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); - ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, job->convHeadTaskDescr); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = - job->tnumForNextPar; -} - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes parallel (1 static level only) -//////////////////////////////////////////////////////////////////////////////// - -INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, - uint16_t NThreadsICV, - uint16_t ThreadLimit) { - uint16_t ThreadsRequested = NThreadsICV; - if (NumThreadsClause != 0) { - ThreadsRequested = NumThreadsClause; - } - - uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); - if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { - ThreadsAvailable = ThreadLimit; - } - - uint16_t NumThreads = ThreadsAvailable; - if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { - NumThreads = ThreadsRequested; - } - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - // On Volta and newer architectures we require that all lanes in - // a warp participate in the parallel region. Round down to a - // multiple of WARPSIZE since it is legal to do so in OpenMP. - if (NumThreads < WARPSIZE) { - NumThreads = 1; - } else { - NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); - } -#endif - - return NumThreads; -} - -// This routine is always called by the team master.. -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); - - omptarget_nvptx_workFn = WorkFn; - - // This routine is only called by the team master. The team master is - // the first thread of the last warp. It always has the logical thread - // id of 0 (since it is a shadow for the first worker thread). 
- const int threadId = 0; - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), - "cannot be called in a parallel region."); - if (currTaskDescr->InParallelRegion()) { - PRINT0(LD_PAR, "already in parallel: go seq\n"); - return; - } - - uint16_t &NumThreadsClause = - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); - - uint16_t NumThreads = - determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); - - if (NumThreadsClause != 0) { - // Reset request to avoid propagating to successive #parallel - NumThreadsClause = 0; - } - - ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", - (int)NumThreads); - ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), - "only team master can create parallel"); - - // Set number of threads on work descriptor. - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); - threadsInTeam = NumThreads; -} - -// All workers call this function. Deactivate those not needed. -// Fn - the outlined work function to execute. -// returns True if this thread is active, else False. -// -// Only the worker threads call this routine. -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); - - ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); - - // Work function and arguments for L1 parallel region. - *WorkFn = omptarget_nvptx_workFn; - - // If this is the termination signal from the master, quit early. - if (!*WorkFn) { - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); - return false; - } - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); - omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); - // Set to true for workers participating in the parallel region. - bool isActive = false; - // Initialize state for active threads. - if (threadId < threadsInTeam) { - // init work descriptor from workdesccr - omptarget_nvptx_TaskDescr *newTaskDescr = - omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); - ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); - newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); - // install new top descriptor - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); - // init private from int value - PRINT(LD_PAR, - "thread will execute parallel region with id %d in a team of " - "%d threads\n", - (int)newTaskDescr->ThreadId(), (int)nThreads); - - isActive = true; - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level - // can be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - IncParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? 
__kmpc_impl_all_lanes : 1u); - } - - return isActive; -} - -EXTERN void __kmpc_kernel_end_parallel() { - // pop stack - PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); - ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); - - // Only the worker threads call this routine and the master warp - // never arrives here. Therefore, use the nvptx thread id. - int threadId = GetThreadIdInBlock(); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); - - // Reconverge the threads at the end of the parallel region to correctly - // handle parallel levels. - // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole - // warp. If only 1 thread is active, not need to reconverge the threads. - // If we have the whole warp, reconverge all the threads in the warp before - // actually trying to change the parallel level. Otherwise, parallel level can - // be changed incorrectly because of threads divergence. - bool IsActiveParallelRegion = threadsInTeam != 1; - DecParallelLevel(IsActiveParallelRegion, - IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); -} - -//////////////////////////////////////////////////////////////////////////////// -// support for parallel that goes sequential -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); - - IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if (checkRuntimeUninitialized(loc)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // assume this is only called for nested parallel - int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - - // unlike actual parallel, threads in the same team do not share - // the workTaskDescr in this case and num threads is fixed to 1 - - // get current task - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - currTaskDescr->SaveLoopData(); - - // allocate new task descriptor and copy value from current one, set prev to - // it - omptarget_nvptx_TaskDescr *newTaskDescr = - (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), - "new seq parallel task"); - newTaskDescr->CopyParent(currTaskDescr); - - // tweak values for serialized parallel case: - // - each thread becomes ID 0 in its serialized parallel, and - // - there is only one thread per team - newTaskDescr->ThreadId() = 0; - - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, - newTaskDescr); -} - -EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, - uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); - - DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); - - if (checkRuntimeUninitialized(loc)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc), - "Expected SPMD mode with uninitialized runtime."); - return; - } - - // pop stack - int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); - // set new top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( - threadId, currTaskDescr->GetPrevTaskDescr()); - // free - SafeFree(currTaskDescr, "new seq parallel task"); - currTaskDescr = 
getMyTopTaskDescriptor(threadId); - currTaskDescr->RestoreLoopData(); -} - -EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); - - return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); -} - -// This kmpc call returns the thread id across all teams. It's value is -// cached by the compiler and used when calling the runtime. On nvptx -// it's cheap to recalculate this value so we never use the result -// of this call. -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - return GetOmpThreadId(tid, checkSPMDMode(loc)); -} - -//////////////////////////////////////////////////////////////////////////////// -// push params -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, - int32_t num_threads) { - PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = - num_threads; -} - -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, - int32_t simd_limit) { - PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; -} - -// Do nothing. The host guarantees we started the requested number of -// teams and we only need inspection of gridDim. - -EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, - int32_t num_teams, int32_t thread_limit) { - PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); - ASSERT0(LT_FUSSY, 0, - "should never have anything with new teams on device"); -} - -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, - int proc_bind) { - PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); -} +//===---- parallel.cu - GPU OpenMP parallel implementation ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Parallel implementation in the GPU. Here is the pattern: +// +// while (not finished) { +// +// if (master) { +// sequential code, decide which par loop to do, or if finished +// __kmpc_kernel_prepare_parallel() // exec by master only +// } +// syncthreads // A +// __kmpc_kernel_parallel() // exec by all +// if (this thread is included in the parallel) { +// switch () for all parallel loops +// __kmpc_kernel_end_parallel() // exec only by threads in parallel +// } +// +// +// The reason we don't exec end_parallel for the threads not included +// in the parallel loop is that for each barrier in the parallel +// region, these non-included threads will cycle through the +// syncthread A. Thus they must preserve their current threadId that +// is larger than thread in team. +// +// To make a long story short... 
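// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the master/worker pattern from
// the comment above spelled out as a kernel. Names (generic_mode_example,
// parallel_region_0) are hypothetical, master selection and termination
// handling are simplified, and real codegen uses the shadow master warp plus
// named barriers rather than plain __syncthreads(); EXTERN is assumed to
// expand to extern "C" __device__.
// ---------------------------------------------------------------------------
#include <cstdint>
extern "C" __device__ void __kmpc_kernel_init(int, int16_t);
extern "C" __device__ void __kmpc_kernel_deinit(int16_t);
extern "C" __device__ void __kmpc_kernel_prepare_parallel(void *, int16_t);
extern "C" __device__ bool __kmpc_kernel_parallel(void **, int16_t);
extern "C" __device__ void __kmpc_kernel_end_parallel();

typedef void (*WorkFnTy)();
__device__ void parallel_region_0() { /* hypothetical outlined body */ }

__global__ void generic_mode_example() {
  // Master is the first thread of the last warp (see GetMasterThreadID below).
  const bool IsMaster = threadIdx.x == ((blockDim.x - 1) & ~(warpSize - 1));
  if (IsMaster)
    __kmpc_kernel_init((int)blockDim.x, /*RequiresOMPRuntime=*/1);
  __syncthreads();

  for (int Round = 0; /* until the master signals termination */; ++Round) {
    if (IsMaster) {
      if (Round == 0)
        __kmpc_kernel_prepare_parallel((void *)&parallel_region_0, 1);
      else
        __kmpc_kernel_deinit(1); // zeroes the work function: "we are finished"
    }
    __syncthreads();                       // "syncthreads // A" above
    void *WorkFn = nullptr;
    bool Active = __kmpc_kernel_parallel(&WorkFn, 1);
    if (!WorkFn)
      break;                               // termination observed by all threads
    if (Active) {                          // "this thread is included"
      ((WorkFnTy)WorkFn)();
      __kmpc_kernel_end_parallel();
    }
    __syncthreads();
  }
}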
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +typedef struct ConvergentSimdJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t slimForNextSimd; +} ConvergentSimdJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent simd (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n"); + __kmpc_impl_lanemask_t ConvergentMask = Mask; + int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); + __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __kmpc_impl_ffs(WorkRemaining); + *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + *LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + int32_t SimdLimit = + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId); + job->slimForNextSimd = SimdLimit; + + int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource); + // reset simdlimit to avoid propagating to successive #simd + if (SimdLimitSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0; + + // We cannot have more than the # of convergent threads. + if (SimdLimitSource > 0) + *NumLanes = __kmpc_impl_min(ConvergentSize, SimdLimitSource); + else + *NumLanes = ConvergentSize; + ASSERT(LT_FUSSY, *NumLanes > 0, "bad thread request of %d threads", + (int)*NumLanes); + + // Set to true for lanes participating in the simd region. + bool isActive = false; + // Initialize state for active threads. + if (*LaneId < *NumLanes) { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + // install top descriptor from the thread for which the lanes are working. 
+ omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + sourceTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + ConvergentSimdJob *job = (ConvergentSimdJob *)buffer; + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = + job->slimForNextSimd; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); +} + +typedef struct ConvergentParallelJob { + omptarget_nvptx_TaskDescr taskDescr; + omptarget_nvptx_TaskDescr *convHeadTaskDescr; + uint16_t tnumForNextPar; +} ConvergentParallelJob; + +//////////////////////////////////////////////////////////////////////////////// +// support for convergent parallelism (team of threads in a warp only) +//////////////////////////////////////////////////////////////////////////////// +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, + int32_t *LaneSource) { + PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n"); + __kmpc_impl_lanemask_t ConvergentMask = Mask; + int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask); + __kmpc_impl_lanemask_t WorkRemaining = ConvergentMask >> (*LaneSource + 1); + *LaneSource += __kmpc_impl_ffs(WorkRemaining); + *IsFinal = __kmpc_impl_popc(WorkRemaining) == 1; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt); + + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource; + + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + int32_t NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + job->tnumForNextPar = NumThreadsClause; + + int32_t NumThreadsSource = + __kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource); + // reset numthreads to avoid propagating to successive #parallel + if (NumThreadsSource > 0 && threadId == sourceThreadId) + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + 0; + + // We cannot have more than the # of convergent threads. + uint16_t NumThreads; + if (NumThreadsSource > 0) + NumThreads = __kmpc_impl_min(ConvergentSize, NumThreadsSource); + else + NumThreads = ConvergentSize; + ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", + (int)NumThreads); + + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. + if (OmpId < NumThreads) { + // init L2 task descriptor and storage for the L1 parallel task descriptor. 
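// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: how a caller is expected to
// drive the convergent-parallel entry points, re-issuing the call until
// IsFinal is set while LaneSource carries the scheduling state across
// iterations. The wrapper name and the outlined body are hypothetical; the
// buffer must provide at least sizeof(ConvergentParallelJob) bytes, and the
// __kmpc_impl_* helpers are the ones declared in target_impl.h above.
// ---------------------------------------------------------------------------
__device__ void convergent_parallel_example(void *JobBuffer) {
  __kmpc_impl_lanemask_t Mask = __kmpc_impl_activemask();
  bool IsFinal = false;
  int32_t LaneSource = -1;
  while (!IsFinal) {
    if (__kmpc_kernel_convergent_parallel(JobBuffer, Mask, &IsFinal,
                                          &LaneSource)) {
      // ... outlined parallel body runs on the lanes selected this round ...
      __kmpc_kernel_end_convergent_parallel(JobBuffer);
    }
  }
}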
+ omptarget_nvptx_TaskDescr *newTaskDescr = &job->taskDescr; + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + omptarget_nvptx_TaskDescr *sourceTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr( + sourceThreadId); + job->convHeadTaskDescr = currTaskDescr; + newTaskDescr->CopyConvergentParent(sourceTaskDescr, OmpId, NumThreads); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + isActive = true; + } + + // requires a memory fence between threads of a warp + return isActive; +} + +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_convergent_parallel\n"); + // pop stack + int threadId = GetLogicalThreadIdInBlock(isSPMDMode()); + ConvergentParallelJob *job = (ConvergentParallelJob *)buffer; + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, job->convHeadTaskDescr); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) = + job->tnumForNextPar; +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes parallel (1 static level only) +//////////////////////////////////////////////////////////////////////////////// + +INLINE static uint16_t determineNumberOfThreads(uint16_t NumThreadsClause, + uint16_t NThreadsICV, + uint16_t ThreadLimit) { + uint16_t ThreadsRequested = NThreadsICV; + if (NumThreadsClause != 0) { + ThreadsRequested = NumThreadsClause; + } + + uint16_t ThreadsAvailable = GetNumberOfWorkersInTeam(); + if (ThreadLimit != 0 && ThreadLimit < ThreadsAvailable) { + ThreadsAvailable = ThreadLimit; + } + + uint16_t NumThreads = ThreadsAvailable; + if (ThreadsRequested != 0 && ThreadsRequested < NumThreads) { + NumThreads = ThreadsRequested; + } + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + // On Volta and newer architectures we require that all lanes in + // a warp participate in the parallel region. Round down to a + // multiple of WARPSIZE since it is legal to do so in OpenMP. + if (NumThreads < WARPSIZE) { + NumThreads = 1; + } else { + NumThreads = (NumThreads & ~((uint16_t)WARPSIZE - 1)); + } +#endif + + return NumThreads; +} + +// This routine is always called by the team master.. +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO, "call to __kmpc_kernel_prepare_parallel\n"); + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); + + omptarget_nvptx_workFn = WorkFn; + + // This routine is only called by the team master. The team master is + // the first thread of the last warp. It always has the logical thread + // id of 0 (since it is a shadow for the first worker thread). 
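// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: a host-side replica of the
// clamping done by determineNumberOfThreads() above, showing how the
// num_threads clause, the nthreads-var ICV and thread_limit interact, and how
// the sm_70+ path rounds the result down to whole warps. The value 992 stands
// in for GetNumberOfWorkersInTeam() and is hypothetical.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

static uint16_t clampNumThreads(uint16_t NumThreadsClause, uint16_t NThreadsICV,
                                uint16_t ThreadLimit, uint16_t WorkersInTeam,
                                bool RoundToWarp) {
  uint16_t Requested = NumThreadsClause ? NumThreadsClause : NThreadsICV;
  uint16_t Available = WorkersInTeam;
  if (ThreadLimit && ThreadLimit < Available)
    Available = ThreadLimit;
  uint16_t N = Available;
  if (Requested && Requested < N)
    N = Requested;
  if (RoundToWarp)                       // sm_70+: whole warps only
    N = N < 32 ? 1 : (uint16_t)(N & ~(uint16_t)31);
  return N;
}

int main() {
  // num_threads(50), no ICV, no thread_limit, 992 workers, Volta rounding:
  std::printf("%u\n", (unsigned)clampNumThreads(50, 0, 0, 992, true));  // 32
  // no clause, no ICV, thread_limit(100): full warps within the limit:
  std::printf("%u\n", (unsigned)clampNumThreads(0, 0, 100, 992, true)); // 96
  return 0;
}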
+ const int threadId = 0; + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(), + "cannot be called in a parallel region."); + if (currTaskDescr->InParallelRegion()) { + PRINT0(LD_PAR, "already in parallel: go seq\n"); + return; + } + + uint16_t &NumThreadsClause = + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId); + + uint16_t NumThreads = + determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit); + + if (NumThreadsClause != 0) { + // Reset request to avoid propagating to successive #parallel + NumThreadsClause = 0; + } + + ASSERT(LT_FUSSY, NumThreads > 0, "bad thread request of %d threads", + (int)NumThreads); + ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(), + "only team master can create parallel"); + + // Set number of threads on work descriptor. + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr); + threadsInTeam = NumThreads; +} + +// All workers call this function. Deactivate those not needed. +// Fn - the outlined work function to execute. +// returns True if this thread is active, else False. +// +// Only the worker threads call this routine. +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel\n"); + + ASSERT0(LT_FUSSY, IsOMPRuntimeInitialized, "Expected initialized runtime."); + + // Work function and arguments for L1 parallel region. + *WorkFn = omptarget_nvptx_workFn; + + // If this is the termination signal from the master, quit early. + if (!*WorkFn) { + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_parallel finished\n"); + return false; + } + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor(); + // Set to true for workers participating in the parallel region. + bool isActive = false; + // Initialize state for active threads. + if (threadId < threadsInTeam) { + // init work descriptor from workdesccr + omptarget_nvptx_TaskDescr *newTaskDescr = + omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId); + ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr"); + newTaskDescr->CopyFromWorkDescr(workDescr.WorkTaskDescr()); + // install new top descriptor + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); + // init private from int value + PRINT(LD_PAR, + "thread will execute parallel region with id %d in a team of " + "%d threads\n", + (int)newTaskDescr->ThreadId(), (int)nThreads); + + isActive = true; + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level + // can be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + IncParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? 
__kmpc_impl_all_lanes : 1u); + } + + return isActive; +} + +EXTERN void __kmpc_kernel_end_parallel() { + // pop stack + PRINT0(LD_IO | LD_PAR, "call to __kmpc_kernel_end_parallel\n"); + ASSERT0(LT_FUSSY, isRuntimeInitialized(), "Expected initialized runtime."); + + // Only the worker threads call this routine and the master warp + // never arrives here. Therefore, use the nvptx thread id. + int threadId = GetThreadIdInBlock(); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + + // Reconverge the threads at the end of the parallel region to correctly + // handle parallel levels. + // In Cuda9+ in non-SPMD mode we have either 1 worker thread or the whole + // warp. If only 1 thread is active, not need to reconverge the threads. + // If we have the whole warp, reconverge all the threads in the warp before + // actually trying to change the parallel level. Otherwise, parallel level can + // be changed incorrectly because of threads divergence. + bool IsActiveParallelRegion = threadsInTeam != 1; + DecParallelLevel(IsActiveParallelRegion, + IsActiveParallelRegion ? __kmpc_impl_all_lanes : 1u); +} + +//////////////////////////////////////////////////////////////////////////////// +// support for parallel that goes sequential +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_serialized_parallel\n"); + + IncParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + return; + } + + // assume this is only called for nested parallel + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + + // unlike actual parallel, threads in the same team do not share + // the workTaskDescr in this case and num threads is fixed to 1 + + // get current task + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + currTaskDescr->SaveLoopData(); + + // allocate new task descriptor and copy value from current one, set prev to + // it + omptarget_nvptx_TaskDescr *newTaskDescr = + (omptarget_nvptx_TaskDescr *)SafeMalloc(sizeof(omptarget_nvptx_TaskDescr), + "new seq parallel task"); + newTaskDescr->CopyParent(currTaskDescr); + + // tweak values for serialized parallel case: + // - each thread becomes ID 0 in its serialized parallel, and + // - there is only one thread per team + newTaskDescr->ThreadId() = 0; + + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId, + newTaskDescr); +} + +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_serialized_parallel\n"); + + DecParallelLevel(/*ActiveParallel=*/false, __kmpc_impl_activemask()); + + if (checkRuntimeUninitialized(loc)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc), + "Expected SPMD mode with uninitialized runtime."); + return; + } + + // pop stack + int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId); + // set new top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr( + threadId, currTaskDescr->GetPrevTaskDescr()); + // free + SafeFree(currTaskDescr, "new seq parallel task"); + currTaskDescr = 
getMyTopTaskDescriptor(threadId); + currTaskDescr->RestoreLoopData(); +} + +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_parallel_level\n"); + + return parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1); +} + +// This kmpc call returns the thread id across all teams. It's value is +// cached by the compiler and used when calling the runtime. On nvptx +// it's cheap to recalculate this value so we never use the result +// of this call. +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc) { + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + return GetOmpThreadId(tid, checkSPMDMode(loc)); +} + +//////////////////////////////////////////////////////////////////////////////// +// push params +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t tid, + int32_t num_threads) { + PRINT(LD_IO, "call kmpc_push_num_threads %d\n", num_threads); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(tid) = + num_threads; +} + +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t tid, + int32_t simd_limit) { + PRINT(LD_IO, "call kmpc_push_simd_limit %d\n", (int)simd_limit); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), "Runtime must be initialized."); + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(tid) = simd_limit; +} + +// Do nothing. The host guarantees we started the requested number of +// teams and we only need inspection of gridDim. + +EXTERN void __kmpc_push_num_teams(kmp_Ident *loc, int32_t tid, + int32_t num_teams, int32_t thread_limit) { + PRINT(LD_IO, "call kmpc_push_num_teams %d\n", (int)num_teams); + ASSERT0(LT_FUSSY, 0, + "should never have anything with new teams on device"); +} + +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t tid, + int proc_bind) { + PRINT(LD_IO, "call kmpc_push_proc_bind %d\n", (int)proc_bind); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu index 427c90a7e0913..04ec735674603 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/reduction.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/reduction.cu @@ -1,531 +1,531 @@ -//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains the implementation of reduction with KMPC interface. 
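// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: how the per-warp parallel level
// is encoded. The level counter lives in the low bits and "has an active
// parallel region" is a separate flag bit (OMP_ACTIVE_PARALLEL_LEVEL, assumed
// here to be a power of two such as 128), which is why __kmpc_parallel_level()
// above masks it off before returning the level.
// ---------------------------------------------------------------------------
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t ActiveFlag = 128;  // stand-in for OMP_ACTIVE_PARALLEL_LEVEL
  uint8_t Level = 0;
  Level += 1 + ActiveFlag;         // enter an active parallel region
  Level += 1;                      // enter a nested, serialized one
  std::printf("raw=%u level=%u active=%d\n", (unsigned)Level,
              (unsigned)(Level & (ActiveFlag - 1)), (Level & ActiveFlag) != 0);
  // prints: raw=130 level=2 active=1
  return 0;
}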
-// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "common/target_atomic.h" -#include "target_impl.h" - -EXTERN -void __kmpc_nvptx_end_reduce(int32_t global_tid) {} - -EXTERN -void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} - -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { - return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size); -} - -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { - uint32_t lo, hi; - __kmpc_impl_unpack(val, lo, hi); - hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size); - lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size); - return __kmpc_impl_pack(lo, hi); -} - -INLINE static void gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct) { - for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { - shflFct(reduce_data, /*LaneId - not used= */ 0, - /*Offset = */ mask, /*AlgoVersion=*/0); - } -} - -INLINE static void gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - uint32_t size, uint32_t tid) { - uint32_t curr_size; - uint32_t mask; - curr_size = size; - mask = curr_size / 2; - while (mask > 0) { - shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); - curr_size = (curr_size + 1) / 2; - mask = curr_size / 2; - } -} - -INLINE static uint32_t -gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { - uint32_t size, remote_id, physical_lane_id; - physical_lane_id = GetThreadIdInBlock() % WARPSIZE; - __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; - __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); - do { - Liveness = __kmpc_impl_activemask(); - remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); - size = __kmpc_impl_popc(Liveness); - logical_lane_id /= 2; - shflFct(reduce_data, /*LaneId =*/logical_lane_id, - /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); - } while (logical_lane_id % 2 == 0 && size > 1); - return (logical_lane_id == 0); -} - -EXTERN -int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) { - gpu_regular_warp_reduce(reduce_data, shflFct); - return GetThreadIdInBlock() % WARPSIZE == - 0; // Result on lane 0 of the simd warp. - } else { - return gpu_irregular_simd_reduce( - reduce_data, shflFct); // Result on the first active lane. - } -} - -INLINE -static int32_t nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - bool isSPMDExecutionMode, bool isRuntimeUninitialized) { - uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); - if (NumThreads == 1) - return 1; - /* - * This reduce function handles reduction within a team. It handles - * parallel regions in both L1 and L2 parallelism levels. It also - * supports Generic, SPMD, and NoOMP modes. - * - * 1. Reduce within a warp. - * 2. 
Warp master copies value to warp 0 via shared memory. - * 3. Warp 0 reduces to a single value. - * 4. The reduced value is available in the thread that returns 1. - */ - -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = BlockThreadId / WARPSIZE; - - // Volta execution model: - // For the Generic execution mode a parallel region either has 1 thread and - // beyond that, always a multiple of 32. For the SPMD execution mode we may - // have any number of threads. - if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/NumThreads % WARPSIZE, - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - } - return BlockThreadId == 0; -#else - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); - else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 - // parallel region may enter here; return - // early. - return gpu_irregular_simd_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - // - // Only L1 parallel region can enter this if condition. - if (NumThreads > WARPSIZE) { - uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = BlockThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - BlockThreadId); - - return BlockThreadId == 0; - } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { - return BlockThreadId == 0; - } - - // Get the OMP thread Id. This is different from BlockThreadId in the case of - // an L2 parallel region. 
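// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the tree pattern that
// gpu_regular_warp_reduce() drives through the compiler-provided shuffle
// callback, written out for the simple case of summing one int across a full
// warp; 0xffffffff plays the role of __kmpc_impl_all_lanes. The irregular
// variant above differs only in halving an arbitrary lane count per step.
// ---------------------------------------------------------------------------
__device__ int warp_sum(int Val) {
  for (unsigned Offset = 32 / 2; Offset > 0; Offset /= 2)
    Val += __shfl_down_sync(0xffffffffu, Val, Offset);
  return Val; // lane 0 holds the warp-wide sum
}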
- return global_tid == 0; -#endif // __CUDA_ARCH__ >= 700 -} - -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, - isSPMDMode(), isRuntimeUninitialized()); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - checkSPMDMode(loc), checkRuntimeUninitialized(loc)); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { - return nvptx_parallel_reduce_nowait( - global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, - /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); -} - -INLINE -static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct, - bool isSPMDExecutionMode) { - uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED volatile bool IsLastTeam; - - // Team masters of all teams write to the scratchpad. - if (ThreadId == 0) { - unsigned int *timestamp = GetTeamsReductionTimestamp(); - char *scratchpad = GetTeamsReductionScratchpad(); - - scratchFct(reduce_data, scratchpad, TeamId, NumTeams); - __kmpc_impl_threadfence(); - - // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. - // It resets 'timestamp' back to 0 once the last team increments - // this counter. - unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1); - IsLastTeam = val == NumTeams - 1; - } - - // We have to wait on L1 barrier because in GENERIC mode the workers - // are waiting on barrier 0 for work. - // - // If we guard this barrier as follows it leads to deadlock, probably - // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); - uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; - __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE); - - // If this team is not the last, quit. - if (/* Volatile read by all threads */ !IsLastTeam) - return 0; - - // - // Last team processing. 
- // - - // Threads in excess of #teams do not participate in reduction of the - // scratchpad values. -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 - uint32_t ActiveThreads = NumThreads; - if (NumTeams < NumThreads) { - ActiveThreads = - (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); - } - if (ThreadId >= ActiveThreads) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - uint32_t WarpId = ThreadId / WARPSIZE; - - // Reduce across warps to the warp master. - if ((ActiveThreads % WARPSIZE == 0) || - (WarpId < WarpsNeeded - 1)) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else if (ActiveThreads > 1) // Partial warp but contiguous lanes - // Only SPMD execution mode comes thru this case. - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/ActiveThreads % WARPSIZE, - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - if (ActiveThreads > WARPSIZE) { - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#else - if (ThreadId >= NumTeams) - return 0; - - // Load from scratchpad and reduce. - char *scratchpad = GetTeamsReductionScratchpad(); - ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); - for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) - ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); - - // Reduce across warps to the warp master. - __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); - if (Liveness == __kmpc_impl_all_lanes) // Full warp - gpu_regular_warp_reduce(reduce_data, shflFct); - else // Partial warp but contiguous lanes - gpu_irregular_warp_reduce(reduce_data, shflFct, - /*LaneCount=*/__kmpc_impl_popc(Liveness), - /*LaneId=*/ThreadId % WARPSIZE); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. 
- cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); - } -#endif // __CUDA_ARCH__ >= 700 - - return ThreadId == 0; -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, - size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, - kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, isSPMDMode()); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/true); -} - -EXTERN -int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { - return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, - reduce_data, shflFct, cpyFct, scratchFct, - ldFct, /*isSPMDExecutionMode=*/false); -} - -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit) { - if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) - return 0; - // The master thread of the team actually does the reduction. - while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u)) - ; - return 1; -} - -EXTERN void -__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit) { - __kmpc_impl_threadfence_system(); - (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u); -} - -INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { - return checkGenericMode(loc) || IsTeamMaster(ThreadId); -} - -INLINE static uint32_t roundToWarpsize(uint32_t s) { - if (s < WARPSIZE) - return 1; - return (s & ~(unsigned)(WARPSIZE - 1)); -} - -DEVICE static volatile uint32_t IterCnt = 0; -DEVICE static volatile uint32_t Cnt = 0; -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct) { - - // Terminate all threads in non-SPMD mode except for the master thread. - if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID()) - return 0; - - uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - - // In non-generic mode all workers participate in the teams reduction. - // In generic mode only the team master participates in the teams - // reduction because the workers are waiting for parallel work. - uint32_t NumThreads = - checkSPMDMode(loc) ? 
GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) - : /*Master thread only*/ 1; - uint32_t TeamId = GetBlockIdInKernel(); - uint32_t NumTeams = GetNumberOfBlocksInKernel(); - static SHARED unsigned Bound; - static SHARED unsigned ChunkTeamCount; - - // Block progress for teams greater than the current upper - // limit. We always only allow a number of teams less or equal - // to the number of slots in the buffer. - bool IsMaster = isMaster(loc, ThreadId); - while (IsMaster) { - // Atomic read - Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); - if (TeamId < Bound + num_of_records) - break; - } - - if (IsMaster) { - int ModBockId = TeamId % num_of_records; - if (TeamId < num_of_records) - lgcpyFct(global_buffer, ModBockId, reduce_data); - else - lgredFct(global_buffer, ModBockId, reduce_data); - __kmpc_impl_threadfence_system(); - - // Increment team counter. - // This counter is incremented by all teams in the current - // BUFFER_SIZE chunk. - ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); - } - // Synchronize - if (checkSPMDMode(loc)) - __kmpc_barrier(loc, global_tid); - - // reduce_data is global or shared so before being reduced within the - // warp we need to bring it in local memory: - // local_reduce_data = reduce_data[i] - // - // Example for 3 reduction variables a, b, c (of potentially different - // types): - // - // buffer layout (struct of arrays): - // a, a, ..., a, b, b, ... b, c, c, ... c - // |__________| - // num_of_records - // - // local_data_reduce layout (struct): - // a, b, c - // - // Each thread will have a local struct containing the values to be - // reduced: - // 1. do reduction within each warp. - // 2. do reduction across warps. - // 3. write the final result to the main reduction variable - // by returning 1 in the thread holding the reduction result. - - // Check if this is the very last team. - unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); - if (ChunkTeamCount == NumTeams - Bound - 1) { - // - // Last team processing. - // - if (ThreadId >= NumRecs) - return 0; - NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); - if (ThreadId >= NumThreads) - return 0; - - // Load from buffer and reduce. - glcpyFct(global_buffer, ThreadId, reduce_data); - for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) - glredFct(global_buffer, i, reduce_data); - - // Reduce across warps to the warp master. - if (NumThreads > 1) { - gpu_regular_warp_reduce(reduce_data, shflFct); - - // When we have more than [warpsize] number of threads - // a block reduction is performed here. - uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); - if (ActiveThreads > WARPSIZE) { - uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; - // Gather all the reduced values from each warp - // to the first warp. - cpyFct(reduce_data, WarpsNeeded); - - uint32_t WarpId = ThreadId / WARPSIZE; - if (WarpId == 0) - gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, - ThreadId); - } - } - - if (IsMaster) { - Cnt = 0; - IterCnt = 0; - return 1; - } - return 0; - } - if (IsMaster && ChunkTeamCount == num_of_records - 1) { - // Allow SIZE number of teams to proceed writing their - // intermediate results to the global buffer. - __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); - } - - return 0; -} - +//===---- reduction.cu - GPU OpenMP reduction implementation ----- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
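// ---------------------------------------------------------------------------
// Illustrative sketch, not part of this patch: the "last team in" handshake
// used by the teams reduction above, reduced to its core. Each team's master
// publishes its partial result, fences, then bumps a global counter; the team
// that observes the final count knows every other contribution is visible and
// can perform the final reduction. Buffer chunking via IterCnt/num_of_records
// and counter reset are omitted; TeamCount and the function name are
// hypothetical.
// ---------------------------------------------------------------------------
__device__ unsigned TeamCount = 0;

__device__ bool publish_and_check_last(unsigned NumTeams) {
  __threadfence();                               // make the partial visible
  unsigned Seen = atomicAdd(&TeamCount, 1u) + 1; // this team's arrival order
  return Seen == NumTeams;                       // true only for the last team
}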
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the implementation of reduction with KMPC interface. +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "common/target_atomic.h" +#include "target_impl.h" + +EXTERN +void __kmpc_nvptx_end_reduce(int32_t global_tid) {} + +EXTERN +void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {} + +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) { + return __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, val, delta, size); +} + +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) { + uint32_t lo, hi; + __kmpc_impl_unpack(val, lo, hi); + hi = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, hi, delta, size); + lo = __kmpc_impl_shfl_down_sync(__kmpc_impl_all_lanes, lo, delta, size); + return __kmpc_impl_pack(lo, hi); +} + +INLINE static void gpu_regular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct) { + for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) { + shflFct(reduce_data, /*LaneId - not used= */ 0, + /*Offset = */ mask, /*AlgoVersion=*/0); + } +} + +INLINE static void gpu_irregular_warp_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + uint32_t size, uint32_t tid) { + uint32_t curr_size; + uint32_t mask; + curr_size = size; + mask = curr_size / 2; + while (mask > 0) { + shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1); + curr_size = (curr_size + 1) / 2; + mask = curr_size / 2; + } +} + +INLINE static uint32_t +gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) { + uint32_t size, remote_id, physical_lane_id; + physical_lane_id = GetThreadIdInBlock() % WARPSIZE; + __kmpc_impl_lanemask_t lanemask_lt = __kmpc_impl_lanemask_lt(); + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2; + __kmpc_impl_lanemask_t lanemask_gt = __kmpc_impl_lanemask_gt(); + do { + Liveness = __kmpc_impl_activemask(); + remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt); + size = __kmpc_impl_popc(Liveness); + logical_lane_id /= 2; + shflFct(reduce_data, /*LaneId =*/logical_lane_id, + /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2); + } while (logical_lane_id % 2 == 0 && size > 1); + return (logical_lane_id == 0); +} + +EXTERN +int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) { + gpu_regular_warp_reduce(reduce_data, shflFct); + return GetThreadIdInBlock() % WARPSIZE == + 0; // Result on lane 0 of the simd warp. + } else { + return gpu_irregular_simd_reduce( + reduce_data, shflFct); // Result on the first active lane. 
+ } +} + +INLINE +static int32_t nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + bool isSPMDExecutionMode, bool isRuntimeUninitialized) { + uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode); + if (NumThreads == 1) + return 1; + /* + * This reduce function handles reduction within a team. It handles + * parallel regions in both L1 and L2 parallelism levels. It also + * supports Generic, SPMD, and NoOMP modes. + * + * 1. Reduce within a warp. + * 2. Warp master copies value to warp 0 via shared memory. + * 3. Warp 0 reduces to a single value. + * 4. The reduced value is available in the thread that returns 1. + */ + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = BlockThreadId / WARPSIZE; + + // Volta execution model: + // For the Generic execution mode a parallel region either has 1 thread and + // beyond that, always a multiple of 32. For the SPMD execution mode we may + // have any number of threads. + if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1)) + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (NumThreads > 1) // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/NumThreads % WARPSIZE, + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + } + return BlockThreadId == 0; +#else + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__kmpc_impl_popc(Liveness), + /*LaneId=*/GetThreadIdInBlock() % WARPSIZE); + else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2 + // parallel region may enter here; return + // early. + return gpu_irregular_simd_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + // + // Only L1 parallel region can enter this if condition. + if (NumThreads > WARPSIZE) { + uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = BlockThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + BlockThreadId); + + return BlockThreadId == 0; + } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) { + return BlockThreadId == 0; + } + + // Get the OMP thread Id. This is different from BlockThreadId in the case of + // an L2 parallel region. 
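+  // Editorial note (not part of the original source): global_tid is the OMP
+  // thread id, so only the thread that is logical thread 0 of its own
+  // (possibly nested L2) parallel region reports holding the reduced value.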
+ return global_tid == 0; +#endif // __CUDA_ARCH__ >= 700 +} + +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, + isSPMDMode(), isRuntimeUninitialized()); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + checkSPMDMode(loc), checkRuntimeUninitialized(loc)); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/true, /*isRuntimeUninitialized=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) { + return nvptx_parallel_reduce_nowait( + global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct, + /*isSPMDExecutionMode=*/false, /*isRuntimeUninitialized=*/true); +} + +INLINE +static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct, + bool isSPMDExecutionMode) { + uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode); + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + static SHARED volatile bool IsLastTeam; + + // Team masters of all teams write to the scratchpad. + if (ThreadId == 0) { + unsigned int *timestamp = GetTeamsReductionTimestamp(); + char *scratchpad = GetTeamsReductionScratchpad(); + + scratchFct(reduce_data, scratchpad, TeamId, NumTeams); + __kmpc_impl_threadfence(); + + // atomicInc increments 'timestamp' and has a range [0, NumTeams-1]. + // It resets 'timestamp' back to 0 once the last team increments + // this counter. + unsigned val = __kmpc_atomic_inc(timestamp, NumTeams - 1); + IsLastTeam = val == NumTeams - 1; + } + + // We have to wait on L1 barrier because in GENERIC mode the workers + // are waiting on barrier 0 for work. + // + // If we guard this barrier as follows it leads to deadlock, probably + // because of a compiler bug: if (!IsGenericMode()) __syncthreads(); + uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE; + __kmpc_impl_named_sync(L1_BARRIER, SyncWarps * WARPSIZE); + + // If this team is not the last, quit. + if (/* Volatile read by all threads */ !IsLastTeam) + return 0; + + // + // Last team processing. 
+ // + + // Threads in excess of #teams do not participate in reduction of the + // scratchpad values. +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700 + uint32_t ActiveThreads = NumThreads; + if (NumTeams < NumThreads) { + ActiveThreads = + (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1); + } + if (ThreadId >= ActiveThreads) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + uint32_t WarpId = ThreadId / WARPSIZE; + + // Reduce across warps to the warp master. + if ((ActiveThreads % WARPSIZE == 0) || + (WarpId < WarpsNeeded - 1)) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else if (ActiveThreads > 1) // Partial warp but contiguous lanes + // Only SPMD execution mode comes thru this case. + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/ActiveThreads % WARPSIZE, + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + if (ActiveThreads > WARPSIZE) { + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#else + if (ThreadId >= NumTeams) + return 0; + + // Load from scratchpad and reduce. + char *scratchpad = GetTeamsReductionScratchpad(); + ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0); + for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads) + ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1); + + // Reduce across warps to the warp master. + __kmpc_impl_lanemask_t Liveness = __kmpc_impl_activemask(); + if (Liveness == __kmpc_impl_all_lanes) // Full warp + gpu_regular_warp_reduce(reduce_data, shflFct); + else // Partial warp but contiguous lanes + gpu_irregular_warp_reduce(reduce_data, shflFct, + /*LaneCount=*/__kmpc_impl_popc(Liveness), + /*LaneId=*/ThreadId % WARPSIZE); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads; + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. 
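+    // Editorial example (not part of the original source): with 128 OMP
+    // threads and NumTeams = 80, ActiveThreads is 64 and WarpsNeeded is 2;
+    // warps 0 and 1 each hold one partial value, cpyFct stages both into
+    // warp 0, and the irregular reduce below leaves the result with
+    // ThreadId 0.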
+ cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId); + } +#endif // __CUDA_ARCH__ >= 700 + + return ThreadId == 0; +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars, + size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, + kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, isSPMDMode()); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/true); +} + +EXTERN +int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) { + return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size, + reduce_data, shflFct, cpyFct, scratchFct, + ldFct, /*isSPMDExecutionMode=*/false); +} + +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit) { + if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0) + return 0; + // The master thread of the team actually does the reduction. + while (__kmpc_atomic_cas((uint32_t *)crit, 0u, 1u)) + ; + return 1; +} + +EXTERN void +__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit) { + __kmpc_impl_threadfence_system(); + (void)__kmpc_atomic_exchange((uint32_t *)crit, 0u); +} + +INLINE static bool isMaster(kmp_Ident *loc, uint32_t ThreadId) { + return checkGenericMode(loc) || IsTeamMaster(ThreadId); +} + +INLINE static uint32_t roundToWarpsize(uint32_t s) { + if (s < WARPSIZE) + return 1; + return (s & ~(unsigned)(WARPSIZE - 1)); +} + +DEVICE static volatile uint32_t IterCnt = 0; +DEVICE static volatile uint32_t Cnt = 0; +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, void *global_buffer, + int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, + kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, + kmp_ListGlobalFctPtr glredFct) { + + // Terminate all threads in non-SPMD mode except for the master thread. + if (checkGenericMode(loc) && GetThreadIdInBlock() != GetMasterThreadID()) + return 0; + + uint32_t ThreadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + + // In non-generic mode all workers participate in the teams reduction. + // In generic mode only the team master participates in the teams + // reduction because the workers are waiting for parallel work. + uint32_t NumThreads = + checkSPMDMode(loc) ? 
GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true) + : /*Master thread only*/ 1; + uint32_t TeamId = GetBlockIdInKernel(); + uint32_t NumTeams = GetNumberOfBlocksInKernel(); + static SHARED unsigned Bound; + static SHARED unsigned ChunkTeamCount; + + // Block progress for teams greater than the current upper + // limit. We always only allow a number of teams less or equal + // to the number of slots in the buffer. + bool IsMaster = isMaster(loc, ThreadId); + while (IsMaster) { + // Atomic read + Bound = __kmpc_atomic_add((uint32_t *)&IterCnt, 0u); + if (TeamId < Bound + num_of_records) + break; + } + + if (IsMaster) { + int ModBockId = TeamId % num_of_records; + if (TeamId < num_of_records) + lgcpyFct(global_buffer, ModBockId, reduce_data); + else + lgredFct(global_buffer, ModBockId, reduce_data); + __kmpc_impl_threadfence_system(); + + // Increment team counter. + // This counter is incremented by all teams in the current + // BUFFER_SIZE chunk. + ChunkTeamCount = __kmpc_atomic_inc((uint32_t *)&Cnt, num_of_records - 1u); + } + // Synchronize + if (checkSPMDMode(loc)) + __kmpc_barrier(loc, global_tid); + + // reduce_data is global or shared so before being reduced within the + // warp we need to bring it in local memory: + // local_reduce_data = reduce_data[i] + // + // Example for 3 reduction variables a, b, c (of potentially different + // types): + // + // buffer layout (struct of arrays): + // a, a, ..., a, b, b, ... b, c, c, ... c + // |__________| + // num_of_records + // + // local_data_reduce layout (struct): + // a, b, c + // + // Each thread will have a local struct containing the values to be + // reduced: + // 1. do reduction within each warp. + // 2. do reduction across warps. + // 3. write the final result to the main reduction variable + // by returning 1 in the thread holding the reduction result. + + // Check if this is the very last team. + unsigned NumRecs = __kmpc_impl_min(NumTeams, uint32_t(num_of_records)); + if (ChunkTeamCount == NumTeams - Bound - 1) { + // + // Last team processing. + // + if (ThreadId >= NumRecs) + return 0; + NumThreads = roundToWarpsize(__kmpc_impl_min(NumThreads, NumRecs)); + if (ThreadId >= NumThreads) + return 0; + + // Load from buffer and reduce. + glcpyFct(global_buffer, ThreadId, reduce_data); + for (uint32_t i = NumThreads + ThreadId; i < NumRecs; i += NumThreads) + glredFct(global_buffer, i, reduce_data); + + // Reduce across warps to the warp master. + if (NumThreads > 1) { + gpu_regular_warp_reduce(reduce_data, shflFct); + + // When we have more than [warpsize] number of threads + // a block reduction is performed here. + uint32_t ActiveThreads = __kmpc_impl_min(NumRecs, NumThreads); + if (ActiveThreads > WARPSIZE) { + uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE; + // Gather all the reduced values from each warp + // to the first warp. + cpyFct(reduce_data, WarpsNeeded); + + uint32_t WarpId = ThreadId / WARPSIZE; + if (WarpId == 0) + gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, + ThreadId); + } + } + + if (IsMaster) { + Cnt = 0; + IterCnt = 0; + return 1; + } + return 0; + } + if (IsMaster && ChunkTeamCount == num_of_records - 1) { + // Allow SIZE number of teams to proceed writing their + // intermediate results to the global buffer. 
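+    // Editorial example (not part of the original source): assuming teams
+    // finish in order, with num_of_records == 2 and 5 teams the second team
+    // of each pair executes this add, raising IterCnt from 0 to 2 and then
+    // from 2 to 4, releasing teams 2-3 and finally team 4 from the spin loop
+    // above; the very last team never reaches this point, having already
+    // taken the final-reduction path.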
+ __kmpc_atomic_add((uint32_t *)&IterCnt, uint32_t(num_of_records)); + } + + return 0; +} + diff --git a/openmp/libomptarget/deviceRTLs/common/src/support.cu b/openmp/libomptarget/deviceRTLs/common/src/support.cu index 85747511d46c1..e7dfa83bc056d 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/support.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/support.cu @@ -1,269 +1,269 @@ -//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Wrapper implementation to some functions natively supported by the GPU. -// -//===----------------------------------------------------------------------===// - -#include "common/support.h" -#include "common/debug.h" -#include "common/omptarget.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// - -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { - execution_param = EMode; - execution_param |= RMode; -} - -DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } - -DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } - -DEVICE bool isRuntimeUninitialized() { - return (execution_param & RuntimeMask) == RuntimeUninitialized; -} - -DEVICE bool isRuntimeInitialized() { - return (execution_param & RuntimeMask) == RuntimeInitialized; -} - -//////////////////////////////////////////////////////////////////////////////// -// Execution Modes based on location parameter fields -//////////////////////////////////////////////////////////////////////////////// - -DEVICE bool checkSPMDMode(kmp_Ident *loc) { - if (!loc) - return isSPMDMode(); - - // If SPMD is true then we are not in the UNDEFINED state so - // we can return immediately. - if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) - return true; - - // If not in SPMD mode and runtime required is a valid - // combination of flags so we can return immediately. - if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) - return false; - - // We are in underfined state. - return isSPMDMode(); -} - -DEVICE bool checkGenericMode(kmp_Ident *loc) { - return !checkSPMDMode(loc); -} - -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) { - if (!loc) - return isRuntimeUninitialized(); - - // If runtime is required then we know we can't be - // in the undefined mode. We can return immediately. - if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) - return false; - - // If runtime is required then we need to check is in - // SPMD mode or not. If not in SPMD mode then we end - // up in the UNDEFINED state that marks the orphaned - // functions. - if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) - return true; - - // Check if we are in an UNDEFINED state. Undefined is denoted by - // non-SPMD + noRuntimeRequired which is a combination that - // cannot actually happen. Undefined states is used to mark orphaned - // functions. 
- return isRuntimeUninitialized(); -} - -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { - return !checkRuntimeUninitialized(loc); -} - -//////////////////////////////////////////////////////////////////////////////// -// support: get info from machine -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// -// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) -// -//////////////////////////////////////////////////////////////////////////////// - -// The master thread id is the first thread (lane) of the last warp. -// Thread id is 0 indexed. -// E.g: If NumThreads is 33, master id is 32. -// If NumThreads is 64, master id is 32. -// If NumThreads is 97, master id is 96. -// If NumThreads is 1024, master id is 992. -// -// Called in Generic Execution Mode only. -DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } - -// The last warp is reserved for the master; other warps are workers. -// Called in Generic Execution Mode only. -DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } - -//////////////////////////////////////////////////////////////////////////////// -// get thread id in team - -// This function may be called in a parallel region by the workers -// or a serial region by the master. If the master (whose CUDA thread -// id is GetMasterThreadID()) calls this routine, we return 0 because -// it is a shadow for the first worker. -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { - // Implemented using control flow (predication) instead of with a modulo - // operation. - int tid = GetThreadIdInBlock(); - if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) - return 0; - else - return tid; -} - -//////////////////////////////////////////////////////////////////////////////// -// -// OpenMP Thread Support Layer -// -//////////////////////////////////////////////////////////////////////////////// - -DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { - // omp_thread_num - int rc; - if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { - rc = 0; - } else if (isSPMDExecutionMode) { - rc = GetThreadIdInBlock(); - } else { - omptarget_nvptx_TaskDescr *currTaskDescr = - omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); - ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); - rc = currTaskDescr->ThreadId(); - } - return rc; -} - -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { - // omp_num_threads - int rc; - int Level = parallelLevel[GetWarpId()]; - if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { - rc = 1; - } else if (isSPMDExecutionMode) { - rc = GetNumberOfThreadsInBlock(); - } else { - rc = threadsInTeam; - } - - return rc; -} - -//////////////////////////////////////////////////////////////////////////////// -// Team id linked to OpenMP - -DEVICE int GetOmpTeamId() { - // omp_team_num - return GetBlockIdInKernel(); // assume 1 block per team -} - -DEVICE int GetNumberOfOmpTeams() { - // omp_num_teams - return GetNumberOfBlocksInKernel(); // assume 1 block per team -} - -//////////////////////////////////////////////////////////////////////////////// -// Masters - -DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } - -//////////////////////////////////////////////////////////////////////////////// -// Parallel level - -DEVICE void IncParallelLevel(bool ActiveParallel, 
__kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] += - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { - __kmpc_impl_syncwarp(Mask); - __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); - unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); - if (Rank == 0) { - parallelLevel[GetWarpId()] -= - (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); - __kmpc_impl_threadfence(); - } - __kmpc_impl_syncwarp(Mask); -} - -//////////////////////////////////////////////////////////////////////////////// -// get OpenMP number of procs - -// Get the number of processors in the device. -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { - if (!isSPMDExecutionMode) - return GetNumberOfWorkersInTeam(); - return GetNumberOfThreadsInBlock(); -} - -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { - return GetNumberOfProcsInDevice(isSPMDExecutionMode); -} - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned long PadBytes(unsigned long size, - unsigned long alignment) // must be a power of 2 -{ - // compute the necessary padding to satisfy alignment constraint - ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, - "alignment %lu is not a power of 2\n", alignment); - return (~(unsigned long)size + 1) & (alignment - 1); -} - -DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success -{ - void *ptr = __kmpc_impl_malloc(size); - PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", - (unsigned long long)size, msg, (unsigned long long)ptr); - return ptr; -} - -DEVICE void *SafeFree(void *ptr, const char *msg) { - PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); - __kmpc_impl_free(ptr); - return NULL; -} - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// - -DEVICE unsigned int *GetTeamsReductionTimestamp() { - return static_cast(ReductionScratchpadPtr); -} - -DEVICE char *GetTeamsReductionScratchpad() { - return static_cast(ReductionScratchpadPtr) + 256; -} - -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) { - ReductionScratchpadPtr = ScratchpadPtr; -} +//===--------- support.cu - GPU OpenMP support functions --------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Wrapper implementation to some functions natively supported by the GPU. 
+// +//===----------------------------------------------------------------------===// + +#include "common/support.h" +#include "common/debug.h" +#include "common/omptarget.h" + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// + +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) { + execution_param = EMode; + execution_param |= RMode; +} + +DEVICE bool isGenericMode() { return (execution_param & ModeMask) == Generic; } + +DEVICE bool isSPMDMode() { return (execution_param & ModeMask) == Spmd; } + +DEVICE bool isRuntimeUninitialized() { + return (execution_param & RuntimeMask) == RuntimeUninitialized; +} + +DEVICE bool isRuntimeInitialized() { + return (execution_param & RuntimeMask) == RuntimeInitialized; +} + +//////////////////////////////////////////////////////////////////////////////// +// Execution Modes based on location parameter fields +//////////////////////////////////////////////////////////////////////////////// + +DEVICE bool checkSPMDMode(kmp_Ident *loc) { + if (!loc) + return isSPMDMode(); + + // If SPMD is true then we are not in the UNDEFINED state so + // we can return immediately. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // If not in SPMD mode and runtime required is a valid + // combination of flags so we can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // We are in underfined state. + return isSPMDMode(); +} + +DEVICE bool checkGenericMode(kmp_Ident *loc) { + return !checkSPMDMode(loc); +} + +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc) { + if (!loc) + return isRuntimeUninitialized(); + + // If runtime is required then we know we can't be + // in the undefined mode. We can return immediately. + if (!(loc->reserved_2 & KMP_IDENT_SIMPLE_RT_MODE)) + return false; + + // If runtime is required then we need to check is in + // SPMD mode or not. If not in SPMD mode then we end + // up in the UNDEFINED state that marks the orphaned + // functions. + if (loc->reserved_2 & KMP_IDENT_SPMD_MODE) + return true; + + // Check if we are in an UNDEFINED state. Undefined is denoted by + // non-SPMD + noRuntimeRequired which is a combination that + // cannot actually happen. Undefined states is used to mark orphaned + // functions. + return isRuntimeUninitialized(); +} + +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc) { + return !checkRuntimeUninitialized(loc); +} + +//////////////////////////////////////////////////////////////////////////////// +// support: get info from machine +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// +// Calls to the Generic Scheme Implementation Layer (assuming 1D layout) +// +//////////////////////////////////////////////////////////////////////////////// + +// The master thread id is the first thread (lane) of the last warp. +// Thread id is 0 indexed. +// E.g: If NumThreads is 33, master id is 32. +// If NumThreads is 64, master id is 32. +// If NumThreads is 97, master id is 96. +// If NumThreads is 1024, master id is 992. +// +// Called in Generic Execution Mode only. +DEVICE int GetMasterThreadID() { return (GetNumberOfThreadsInBlock() - 1) & ~(WARPSIZE - 1); } + +// The last warp is reserved for the master; other warps are workers. +// Called in Generic Execution Mode only. 
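+// Editorial example (not part of the original source): with 128 threads in
+// the block the master id is 96, so threads 0-95 are workers and this
+// function returns 96.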
+DEVICE int GetNumberOfWorkersInTeam() { return GetMasterThreadID(); } + +//////////////////////////////////////////////////////////////////////////////// +// get thread id in team + +// This function may be called in a parallel region by the workers +// or a serial region by the master. If the master (whose CUDA thread +// id is GetMasterThreadID()) calls this routine, we return 0 because +// it is a shadow for the first worker. +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode) { + // Implemented using control flow (predication) instead of with a modulo + // operation. + int tid = GetThreadIdInBlock(); + if (!isSPMDExecutionMode && tid >= GetMasterThreadID()) + return 0; + else + return tid; +} + +//////////////////////////////////////////////////////////////////////////////// +// +// OpenMP Thread Support Layer +// +//////////////////////////////////////////////////////////////////////////////// + +DEVICE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) { + // omp_thread_num + int rc; + if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) { + rc = 0; + } else if (isSPMDExecutionMode) { + rc = GetThreadIdInBlock(); + } else { + omptarget_nvptx_TaskDescr *currTaskDescr = + omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId); + ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr"); + rc = currTaskDescr->ThreadId(); + } + return rc; +} + +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) { + // omp_num_threads + int rc; + int Level = parallelLevel[GetWarpId()]; + if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) { + rc = 1; + } else if (isSPMDExecutionMode) { + rc = GetNumberOfThreadsInBlock(); + } else { + rc = threadsInTeam; + } + + return rc; +} + +//////////////////////////////////////////////////////////////////////////////// +// Team id linked to OpenMP + +DEVICE int GetOmpTeamId() { + // omp_team_num + return GetBlockIdInKernel(); // assume 1 block per team +} + +DEVICE int GetNumberOfOmpTeams() { + // omp_num_teams + return GetNumberOfBlocksInKernel(); // assume 1 block per team +} + +//////////////////////////////////////////////////////////////////////////////// +// Masters + +DEVICE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); } + +//////////////////////////////////////////////////////////////////////////////// +// Parallel level + +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); + if (Rank == 0) { + parallelLevel[GetWarpId()] += + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __kmpc_impl_threadfence(); + } + __kmpc_impl_syncwarp(Mask); +} + +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask) { + __kmpc_impl_syncwarp(Mask); + __kmpc_impl_lanemask_t LaneMaskLt = __kmpc_impl_lanemask_lt(); + unsigned Rank = __kmpc_impl_popc(Mask & LaneMaskLt); + if (Rank == 0) { + parallelLevel[GetWarpId()] -= + (1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0)); + __kmpc_impl_threadfence(); + } + __kmpc_impl_syncwarp(Mask); +} + +//////////////////////////////////////////////////////////////////////////////// +// get OpenMP number of procs + +// Get the number of processors in the device. 
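+// Editorial note (not part of the original source): in Generic mode the
+// master warp is excluded, so a 128-thread block reports 96 procs, while in
+// SPMD mode all 128 threads are counted.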
+DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode) { + if (!isSPMDExecutionMode) + return GetNumberOfWorkersInTeam(); + return GetNumberOfThreadsInBlock(); +} + +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode) { + return GetNumberOfProcsInDevice(isSPMDExecutionMode); +} + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +DEVICE unsigned long PadBytes(unsigned long size, + unsigned long alignment) // must be a power of 2 +{ + // compute the necessary padding to satisfy alignment constraint + ASSERT(LT_FUSSY, (alignment & (alignment - 1)) == 0, + "alignment %lu is not a power of 2\n", alignment); + return (~(unsigned long)size + 1) & (alignment - 1); +} + +DEVICE void *SafeMalloc(size_t size, const char *msg) // check if success +{ + void *ptr = __kmpc_impl_malloc(size); + PRINT(LD_MEM, "malloc data of size %llu for %s: 0x%llx\n", + (unsigned long long)size, msg, (unsigned long long)ptr); + return ptr; +} + +DEVICE void *SafeFree(void *ptr, const char *msg) { + PRINT(LD_MEM, "free data ptr 0x%llx for %s\n", (unsigned long long)ptr, msg); + __kmpc_impl_free(ptr); + return NULL; +} + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// + +DEVICE unsigned int *GetTeamsReductionTimestamp() { + return static_cast(ReductionScratchpadPtr); +} + +DEVICE char *GetTeamsReductionScratchpad() { + return static_cast(ReductionScratchpadPtr) + 256; +} + +DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr) { + ReductionScratchpadPtr = ScratchpadPtr; +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/sync.cu b/openmp/libomptarget/deviceRTLs/common/src/sync.cu index 2ac3e3f9c7c0a..ba6c66340a764 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/sync.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/sync.cu @@ -1,155 +1,155 @@ -//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Include all synchronization. -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// KMP Ordered calls -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_ordered\n"); -} - -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { - PRINT0(LD_IO, "call kmpc_end_ordered\n"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP Barriers -//////////////////////////////////////////////////////////////////////////////// - -// a team is a block: we can use CUDA native synchronization mechanism -// FIXME: what if not all threads (warps) participate to the barrier? 
-// We may need to implement it differently - -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); - __kmpc_barrier(loc_ref, tid); - PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); - return 0; -} - -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { - if (checkRuntimeUninitialized(loc_ref)) { - ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref), - "Expected SPMD mode with uninitialized runtime."); - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); - int numberOfActiveOMPThreads = - GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); - if (numberOfActiveOMPThreads > 1) { - if (checkSPMDMode(loc_ref)) { - __kmpc_barrier_simple_spmd(loc_ref, tid); - } else { - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier with %d omp threads, sync parameter %d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. - __kmpc_impl_named_sync(L1_BARRIER, threads); - } - } else { - // Still need to flush the memory per the standard. - __kmpc_flush(loc_ref); - } // numberOfActiveOMPThreads > 1 - PRINT0(LD_SYNC, "completed kmpc_barrier\n"); - } -} - -// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { - PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); - __kmpc_impl_syncthreads(); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); -} - -// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 -// parallel region and that all worker threads participate. -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { - int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; - // The #threads parameter must be rounded up to the WARPSIZE. - int threads = - WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); - - PRINT(LD_SYNC, - "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " - "%d\n", - (int)numberOfActiveOMPThreads, (int)threads); - // Barrier #1 is for synchronization among active threads. 
- __kmpc_impl_named_sync(L1_BARRIER, threads); - PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP MASTER -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_master\n"); - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_master\n"); - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); -} - -//////////////////////////////////////////////////////////////////////////////// -// KMP SINGLE -//////////////////////////////////////////////////////////////////////////////// - -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_single\n"); - // decide to implement single with master; master get the single - return IsTeamMaster(global_tid); -} - -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { - PRINT0(LD_IO, "call kmpc_end_single\n"); - // decide to implement single with master: master get the single - ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); - // sync barrier is explicitly called... so that is not a problem -} - -//////////////////////////////////////////////////////////////////////////////// -// Flush -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_flush(kmp_Ident *loc) { - PRINT0(LD_IO, "call kmpc_flush\n"); - __kmpc_impl_threadfence(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Vote -//////////////////////////////////////////////////////////////////////////////// - -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { - PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); - return __kmpc_impl_activemask(); -} - -//////////////////////////////////////////////////////////////////////////////// -// Syncwarp -//////////////////////////////////////////////////////////////////////////////// - -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { - PRINT0(LD_IO, "call __kmpc_syncwarp\n"); - __kmpc_impl_syncwarp(Mask); -} +//===------------ sync.cu - GPU OpenMP synchronizations ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Include all synchronization. 
+// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// KMP Ordered calls +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_ordered\n"); +} + +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t tid) { + PRINT0(LD_IO, "call kmpc_end_ordered\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP Barriers +//////////////////////////////////////////////////////////////////////////////// + +// a team is a block: we can use CUDA native synchronization mechanism +// FIXME: what if not all threads (warps) participate to the barrier? +// We may need to implement it differently + +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc_ref, int32_t tid) { + PRINT0(LD_IO, "call kmpc_cancel_barrier\n"); + __kmpc_barrier(loc_ref, tid); + PRINT0(LD_SYNC, "completed kmpc_cancel_barrier\n"); + return 0; +} + +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) { + if (checkRuntimeUninitialized(loc_ref)) { + ASSERT0(LT_FUSSY, checkSPMDMode(loc_ref), + "Expected SPMD mode with uninitialized runtime."); + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else { + tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref)); + int numberOfActiveOMPThreads = + GetNumberOfOmpThreads(checkSPMDMode(loc_ref)); + if (numberOfActiveOMPThreads > 1) { + if (checkSPMDMode(loc_ref)) { + __kmpc_barrier_simple_spmd(loc_ref, tid); + } else { + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier with %d omp threads, sync parameter %d\n", + (int)numberOfActiveOMPThreads, (int)threads); + // Barrier #1 is for synchronization among active threads. + __kmpc_impl_named_sync(L1_BARRIER, threads); + } + } else { + // Still need to flush the memory per the standard. + __kmpc_flush(loc_ref); + } // numberOfActiveOMPThreads > 1 + PRINT0(LD_SYNC, "completed kmpc_barrier\n"); + } +} + +// Emit a simple barrier call in SPMD mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid) { + PRINT0(LD_SYNC, "call kmpc_barrier_simple_spmd\n"); + __kmpc_impl_syncthreads(); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_spmd\n"); +} + +// Emit a simple barrier call in Generic mode. Assumes the caller is in an L0 +// parallel region and that all worker threads participate. +EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid) { + int numberOfActiveOMPThreads = GetNumberOfThreadsInBlock() - WARPSIZE; + // The #threads parameter must be rounded up to the WARPSIZE. + int threads = + WARPSIZE * ((numberOfActiveOMPThreads + WARPSIZE - 1) / WARPSIZE); + + PRINT(LD_SYNC, + "call kmpc_barrier_simple_generic with %d omp threads, sync parameter " + "%d\n", + (int)numberOfActiveOMPThreads, (int)threads); + // Barrier #1 is for synchronization among active threads. 
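+  // Editorial example (not part of the original source): with a 128-thread
+  // block, numberOfActiveOMPThreads is 128 - 32 = 96 and threads rounds up
+  // to 96, so the reserved master warp is not included in this named barrier.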
+ __kmpc_impl_named_sync(L1_BARRIER, threads); + PRINT0(LD_SYNC, "completed kmpc_barrier_simple_generic\n"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP MASTER +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_master\n"); + return IsTeamMaster(global_tid); +} + +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_master\n"); + ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); +} + +//////////////////////////////////////////////////////////////////////////////// +// KMP SINGLE +//////////////////////////////////////////////////////////////////////////////// + +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_single\n"); + // decide to implement single with master; master get the single + return IsTeamMaster(global_tid); +} + +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid) { + PRINT0(LD_IO, "call kmpc_end_single\n"); + // decide to implement single with master: master get the single + ASSERT0(LT_FUSSY, IsTeamMaster(global_tid), "expected only master here"); + // sync barrier is explicitly called... so that is not a problem +} + +//////////////////////////////////////////////////////////////////////////////// +// Flush +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_flush(kmp_Ident *loc) { + PRINT0(LD_IO, "call kmpc_flush\n"); + __kmpc_impl_threadfence(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Vote +//////////////////////////////////////////////////////////////////////////////// + +EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask() { + PRINT0(LD_IO, "call __kmpc_warp_active_thread_mask\n"); + return __kmpc_impl_activemask(); +} + +//////////////////////////////////////////////////////////////////////////////// +// Syncwarp +//////////////////////////////////////////////////////////////////////////////// + +EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t Mask) { + PRINT0(LD_IO, "call __kmpc_syncwarp\n"); + __kmpc_impl_syncwarp(Mask); +} diff --git a/openmp/libomptarget/deviceRTLs/common/src/task.cu b/openmp/libomptarget/deviceRTLs/common/src/task.cu index 5e5bc350d2775..0c11d3b4f9dbc 100644 --- a/openmp/libomptarget/deviceRTLs/common/src/task.cu +++ b/openmp/libomptarget/deviceRTLs/common/src/task.cu @@ -1,216 +1,216 @@ -//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Task implementation support. -// -// explicit task structure uses -// omptarget_nvptx task -// kmp_task -// -// where kmp_task is -// - klegacy_TaskDescr <- task pointer -// shared -> X -// routine -// part_id -// descr -// - private (of size given by task_alloc call). Accessed by -// task+sizeof(klegacy_TaskDescr) -// * private data * -// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr -// * pointer table to shared variables * -// - end -// -//===----------------------------------------------------------------------===// - -#include "common/omptarget.h" - -EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( - kmp_Ident *loc, // unused - uint32_t global_tid, // unused - int32_t flag, // unused (because in our impl, all are immediately exec - size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, - kmp_TaskFctPtr taskSub) { - PRINT(LD_IO, - "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " - "fct 0x%llx)\n", - (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, - (unsigned long long)taskSub); - // want task+priv to be a multiple of 8 bytes - size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); - sizeOfTaskInclPrivate += padForTaskInclPriv; - size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; - ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, - "need task descr of size %d to be a multiple of %d\n", - (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); - size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( - totSize, "explicit task descriptor"); - kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; - ASSERT0(LT_FUSSY, - (uint64_t)newKmpTaskDescr == - (uint64_t)ADD_BYTES(newExplicitTaskDescr, - sizeof(omptarget_nvptx_TaskDescr)), - "bad size assumptions"); - // init kmp_TaskDescr - newKmpTaskDescr->sharedPointerTable = - (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); - newKmpTaskDescr->sub = taskSub; - newKmpTaskDescr->destructors = NULL; - PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", - (unsigned long long)newKmpTaskDescr, - (unsigned long long)newExplicitTaskDescr); - - return newKmpTaskDescr; -} - -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, - 0); -} - -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList) { - PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", - P64(newKmpTaskDescr)); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - - // 3. 
call sub - PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", - (unsigned long long)newKmpTaskDescr->sub, - (unsigned long long)newKmpTaskDescr); - newKmpTaskDescr->sub(0, newKmpTaskDescr); - PRINT(LD_TASK, "return from call task sub 0x%llx()\n", - (unsigned long long)newKmpTaskDescr->sub); - - // 4. pop context - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); - return 0; -} - -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - - // 2. push new context: update new task descriptor - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); - newTaskDescr->CopyForExplicitTask(parentTaskDescr); - // set new task descriptor as top - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); - // 3... noting to call... is inline - // 4 & 5 ... done in complete -} - -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr) { - PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", - (unsigned long long)newKmpTaskDescr); - ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), - "Runtime must be initialized."); - // 1. get explicit task descr from kmp task descr - omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = - (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( - newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); - ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, - "bad assumptions"); - omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; - ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, - "bad assumptions"); - // 2. get parent - omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); - // 3... noting to call... is inline - // 4. pop context - int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); - omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, - parentTaskDescr); - // 5. 
free - SafeFree(newExplicitTaskDescr, "explicit task descriptor"); -} - -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); - // nothing to do as all our tasks are executed as final -} - -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part) { - PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); - // do nothing: tasks are executed immediately, no yielding allowed - return 0; -} - -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { - PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); - // nothing to do as all our tasks are executed as final - return 0; -} - -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup) { - - // skip task entirely if empty iteration space - if (*lb > *ub) - return; - - // the compiler has already stored lb and ub in the kmp_TaskDescr structure - // as we are using a single task to execute the entire loop, we can leave - // the initial task_t untouched - - __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); -} +//===------------- task.h - NVPTX OpenMP tasks support ----------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Task implementation support. +// +// explicit task structure uses +// omptarget_nvptx task +// kmp_task +// +// where kmp_task is +// - klegacy_TaskDescr <- task pointer +// shared -> X +// routine +// part_id +// descr +// - private (of size given by task_alloc call). Accessed by +// task+sizeof(klegacy_TaskDescr) +// * private data * +// - shared: X. 
Accessed by shared ptr in klegacy_TaskDescr +// * pointer table to shared variables * +// - end +// +//===----------------------------------------------------------------------===// + +#include "common/omptarget.h" + +EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc( + kmp_Ident *loc, // unused + uint32_t global_tid, // unused + int32_t flag, // unused (because in our impl, all are immediately exec + size_t sizeOfTaskInclPrivate, size_t sizeOfSharedTable, + kmp_TaskFctPtr taskSub) { + PRINT(LD_IO, + "call __kmpc_omp_task_alloc(size priv&struct %lld, shared %lld, " + "fct 0x%llx)\n", + (long long)sizeOfTaskInclPrivate, (long long)sizeOfSharedTable, + (unsigned long long)taskSub); + // want task+priv to be a multiple of 8 bytes + size_t padForTaskInclPriv = PadBytes(sizeOfTaskInclPrivate, sizeof(void *)); + sizeOfTaskInclPrivate += padForTaskInclPriv; + size_t kmpSize = sizeOfTaskInclPrivate + sizeOfSharedTable; + ASSERT(LT_FUSSY, sizeof(omptarget_nvptx_TaskDescr) % sizeof(void *) == 0, + "need task descr of size %d to be a multiple of %d\n", + (int)sizeof(omptarget_nvptx_TaskDescr), (int)sizeof(void *)); + size_t totSize = sizeof(omptarget_nvptx_TaskDescr) + kmpSize; + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SafeMalloc( + totSize, "explicit task descriptor"); + kmp_TaskDescr *newKmpTaskDescr = &newExplicitTaskDescr->kmpTaskDescr; + ASSERT0(LT_FUSSY, + (uint64_t)newKmpTaskDescr == + (uint64_t)ADD_BYTES(newExplicitTaskDescr, + sizeof(omptarget_nvptx_TaskDescr)), + "bad size assumptions"); + // init kmp_TaskDescr + newKmpTaskDescr->sharedPointerTable = + (void *)((char *)newKmpTaskDescr + sizeOfTaskInclPrivate); + newKmpTaskDescr->sub = taskSub; + newKmpTaskDescr->destructors = NULL; + PRINT(LD_TASK, "return with task descr kmp: 0x%llx, omptarget-nvptx 0x%llx\n", + (unsigned long long)newKmpTaskDescr, + (unsigned long long)newExplicitTaskDescr); + + return newKmpTaskDescr; +} + +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + return __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, + 0); +} + +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList) { + PRINT(LD_IO, "call to __kmpc_omp_task_with_deps(task 0x%llx)\n", + P64(newKmpTaskDescr)); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + + // 3. 
call sub + PRINT(LD_TASK, "call task sub 0x%llx(task descr 0x%llx)\n", + (unsigned long long)newKmpTaskDescr->sub, + (unsigned long long)newKmpTaskDescr); + newKmpTaskDescr->sub(0, newKmpTaskDescr); + PRINT(LD_TASK, "return from call task sub 0x%llx()\n", + (unsigned long long)newKmpTaskDescr->sub); + + // 4. pop context + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); + return 0; +} + +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_begin_if0(task 0x%llx)\n", + (unsigned long long)newKmpTaskDescr); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + + // 2. push new context: update new task descriptor + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_TaskDescr *parentTaskDescr = getMyTopTaskDescriptor(tid); + newTaskDescr->CopyForExplicitTask(parentTaskDescr); + // set new task descriptor as top + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, newTaskDescr); + // 3... noting to call... is inline + // 4 & 5 ... done in complete +} + +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr) { + PRINT(LD_IO, "call to __kmpc_omp_task_complete_if0(task 0x%llx)\n", + (unsigned long long)newKmpTaskDescr); + ASSERT0(LT_FUSSY, checkRuntimeInitialized(loc), + "Runtime must be initialized."); + // 1. get explicit task descr from kmp task descr + omptarget_nvptx_ExplicitTaskDescr *newExplicitTaskDescr = + (omptarget_nvptx_ExplicitTaskDescr *)SUB_BYTES( + newKmpTaskDescr, sizeof(omptarget_nvptx_TaskDescr)); + ASSERT0(LT_FUSSY, &newExplicitTaskDescr->kmpTaskDescr == newKmpTaskDescr, + "bad assumptions"); + omptarget_nvptx_TaskDescr *newTaskDescr = &newExplicitTaskDescr->taskDescr; + ASSERT0(LT_FUSSY, (uint64_t)newTaskDescr == (uint64_t)newExplicitTaskDescr, + "bad assumptions"); + // 2. get parent + omptarget_nvptx_TaskDescr *parentTaskDescr = newTaskDescr->GetPrevTaskDescr(); + // 3... noting to call... is inline + // 4. pop context + int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc)); + omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(tid, + parentTaskDescr); + // 5. 
free + SafeFree(newExplicitTaskDescr, "explicit task descriptor"); +} + +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + PRINT0(LD_IO, "call to __kmpc_omp_wait_deps(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_end_taskgroup(..)\n"); + // nothing to do as all our tasks are executed as final +} + +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, + int end_part) { + PRINT0(LD_IO, "call to __kmpc_taskyield()\n"); + // do nothing: tasks are executed immediately, no yielding allowed + return 0; +} + +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid) { + PRINT0(LD_IO, "call to __kmpc_taskwait()\n"); + // nothing to do as all our tasks are executed as final + return 0; +} + +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup) { + + // skip task entirely if empty iteration space + if (*lb > *ub) + return; + + // the compiler has already stored lb and ub in the kmp_TaskDescr structure + // as we are using a single task to execute the entire loop, we can leave + // the initial task_t untouched + + __kmpc_omp_task_with_deps(loc, global_tid, newKmpTaskDescr, 0, 0, 0, 0); +} diff --git a/openmp/libomptarget/deviceRTLs/common/state-queue.h b/openmp/libomptarget/deviceRTLs/common/state-queue.h index 8320929cfaf3a..7884d7cbd0df6 100644 --- a/openmp/libomptarget/deviceRTLs/common/state-queue.h +++ b/openmp/libomptarget/deviceRTLs/common/state-queue.h @@ -1,51 +1,51 @@ -//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file contains a queue to hand out OpenMP state objects to teams of -// one or more kernels. -// -// Reference: -// Thomas R.W. Scogland and Wu-chun Feng. 2015. -// Design and Evaluation of Scalable Concurrent Queues for Many-Core -// Architectures. International Conference on Performance Engineering. 
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef __STATE_QUEUE_H
-#define __STATE_QUEUE_H
-
-#include
-
-#include "target_impl.h"
-
-template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
-private:
-  ElementType elements[SIZE];
-  volatile ElementType *elementQueue[SIZE];
-  volatile uint32_t head;
-  volatile uint32_t ids[SIZE];
-  volatile uint32_t tail;
-
-  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
-  INLINE uint32_t ENQUEUE_TICKET();
-  INLINE uint32_t DEQUEUE_TICKET();
-  INLINE static uint32_t ID(uint32_t ticket);
-  INLINE bool IsServing(uint32_t slot, uint32_t id);
-  INLINE void PushElement(uint32_t slot, ElementType *element);
-  INLINE ElementType *PopElement(uint32_t slot);
-  INLINE void DoneServing(uint32_t slot, uint32_t id);
-
-public:
-  INLINE omptarget_nvptx_Queue() {}
-  INLINE void Enqueue(ElementType *element);
-  INLINE ElementType *Dequeue();
-};
-
-#include "state-queuei.h"
-
-#endif
+//===--------- statequeue.h - NVPTX OpenMP GPU State Queue ------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a queue to hand out OpenMP state objects to teams of
+// one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __STATE_QUEUE_H
+#define __STATE_QUEUE_H
+
+#include
+
+#include "target_impl.h"
+
+template <typename ElementType, uint32_t SIZE> class omptarget_nvptx_Queue {
+private:
+  ElementType elements[SIZE];
+  volatile ElementType *elementQueue[SIZE];
+  volatile uint32_t head;
+  volatile uint32_t ids[SIZE];
+  volatile uint32_t tail;
+
+  static const uint32_t MAX_ID = (1u << 31) / SIZE / 2;
+  INLINE uint32_t ENQUEUE_TICKET();
+  INLINE uint32_t DEQUEUE_TICKET();
+  INLINE static uint32_t ID(uint32_t ticket);
+  INLINE bool IsServing(uint32_t slot, uint32_t id);
+  INLINE void PushElement(uint32_t slot, ElementType *element);
+  INLINE ElementType *PopElement(uint32_t slot);
+  INLINE void DoneServing(uint32_t slot, uint32_t id);
+
+public:
+  INLINE omptarget_nvptx_Queue() {}
+  INLINE void Enqueue(ElementType *element);
+  INLINE ElementType *Dequeue();
+};
+
+#include "state-queuei.h"
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/state-queuei.h b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
index 1bd261f2826ac..5c14f9aad2939 100644
--- a/openmp/libomptarget/deviceRTLs/common/state-queuei.h
+++ b/openmp/libomptarget/deviceRTLs/common/state-queuei.h
@@ -1,90 +1,90 @@
-//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains the implementation of a queue to hand out OpenMP state
-// objects to teams of one or more kernels.
-//
-// Reference:
-// Thomas R.W. Scogland and Wu-chun Feng. 2015.
-// Design and Evaluation of Scalable Concurrent Queues for Many-Core
-// Architectures. International Conference on Performance Engineering.
-//
-//===----------------------------------------------------------------------===//
-
-#include "state-queue.h"
-#include "common/target_atomic.h"
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
-  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
-  return __kmpc_atomic_add((unsigned int *)&head, 1u);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE uint32_t
-omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
-  return (ticket / SIZE) * 2;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
-                                                                uint32_t id) {
-  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
-                                                      ElementType *element) {
-  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
-                         (unsigned long long)element);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *
-omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
-  return (ElementType *)__kmpc_atomic_add(
-      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
-                                                                  uint32_t id) {
-  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE void
-omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
-  uint32_t ticket = ENQUEUE_TICKET();
-  uint32_t slot = ticket % SIZE;
-  uint32_t id = ID(ticket) + 1;
-  while (!IsServing(slot, id))
-    ;
-  PushElement(slot, element);
-  DoneServing(slot, id);
-}
-
-template <typename ElementType, uint32_t SIZE>
-INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
-  uint32_t ticket = DEQUEUE_TICKET();
-  uint32_t slot = ticket % SIZE;
-  uint32_t id = ID(ticket);
-  while (!IsServing(slot, id))
-    ;
-  ElementType *element = PopElement(slot);
-  // This is to populate the queue because of the lack of GPU constructors.
-  if (element == 0)
-    element = &elements[slot];
-  DoneServing(slot, id);
-  return element;
-}
+//===------- state-queuei.h - OpenMP GPU State Queue ------------- CUDA -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of a queue to hand out OpenMP state
+// objects to teams of one or more kernels.
+//
+// Reference:
+// Thomas R.W. Scogland and Wu-chun Feng. 2015.
+// Design and Evaluation of Scalable Concurrent Queues for Many-Core
+// Architectures. International Conference on Performance Engineering.
+//
+//===----------------------------------------------------------------------===//
+
+#include "state-queue.h"
+#include "common/target_atomic.h"
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::ENQUEUE_TICKET() {
+  return __kmpc_atomic_add((unsigned int *)&tail, 1u);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t omptarget_nvptx_Queue<ElementType, SIZE>::DEQUEUE_TICKET() {
+  return __kmpc_atomic_add((unsigned int *)&head, 1u);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE uint32_t
+omptarget_nvptx_Queue<ElementType, SIZE>::ID(uint32_t ticket) {
+  return (ticket / SIZE) * 2;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE bool omptarget_nvptx_Queue<ElementType, SIZE>::IsServing(uint32_t slot,
+                                                                uint32_t id) {
+  return __kmpc_atomic_add((unsigned int *)&ids[slot], 0u) == id;
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::PushElement(uint32_t slot,
+                                                      ElementType *element) {
+  __kmpc_atomic_exchange((unsigned long long *)&elementQueue[slot],
+                         (unsigned long long)element);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *
+omptarget_nvptx_Queue<ElementType, SIZE>::PopElement(uint32_t slot) {
+  return (ElementType *)__kmpc_atomic_add(
+      (unsigned long long *)&elementQueue[slot], (unsigned long long)0);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void omptarget_nvptx_Queue<ElementType, SIZE>::DoneServing(uint32_t slot,
+                                                                  uint32_t id) {
+  __kmpc_atomic_exchange((unsigned int *)&ids[slot], (id + 1) % MAX_ID);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE void
+omptarget_nvptx_Queue<ElementType, SIZE>::Enqueue(ElementType *element) {
+  uint32_t ticket = ENQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket) + 1;
+  while (!IsServing(slot, id))
+    ;
+  PushElement(slot, element);
+  DoneServing(slot, id);
+}
+
+template <typename ElementType, uint32_t SIZE>
+INLINE ElementType *omptarget_nvptx_Queue<ElementType, SIZE>::Dequeue() {
+  uint32_t ticket = DEQUEUE_TICKET();
+  uint32_t slot = ticket % SIZE;
+  uint32_t id = ID(ticket);
+  while (!IsServing(slot, id))
+    ;
+  ElementType *element = PopElement(slot);
+  // This is to populate the queue because of the lack of GPU constructors.
+  if (element == 0)
+    element = &elements[slot];
+  DoneServing(slot, id);
+  return element;
+}
diff --git a/openmp/libomptarget/deviceRTLs/common/support.h b/openmp/libomptarget/deviceRTLs/common/support.h
index 913c4c3c323fc..6dfb8e44c24ea 100644
--- a/openmp/libomptarget/deviceRTLs/common/support.h
+++ b/openmp/libomptarget/deviceRTLs/common/support.h
@@ -1,99 +1,99 @@
-//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Wrapper to some functions natively supported by the GPU.
-// -//===----------------------------------------------------------------------===// - -#ifndef OMPTARGET_SUPPORT_H -#define OMPTARGET_SUPPORT_H - -#include "interface.h" -#include "target_impl.h" - -//////////////////////////////////////////////////////////////////////////////// -// Execution Parameters -//////////////////////////////////////////////////////////////////////////////// -enum ExecutionMode { - Spmd = 0x00u, - Generic = 0x01u, - ModeMask = 0x01u, -}; - -enum RuntimeMode { - RuntimeInitialized = 0x00u, - RuntimeUninitialized = 0x02u, - RuntimeMask = 0x02u, -}; - -DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); -DEVICE bool isGenericMode(); -DEVICE bool isSPMDMode(); -DEVICE bool isRuntimeUninitialized(); -DEVICE bool isRuntimeInitialized(); - -//////////////////////////////////////////////////////////////////////////////// -// Execution Modes based on location parameter fields -//////////////////////////////////////////////////////////////////////////////// - -DEVICE bool checkSPMDMode(kmp_Ident *loc); -DEVICE bool checkGenericMode(kmp_Ident *loc); -DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); -DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); - -//////////////////////////////////////////////////////////////////////////////// -// get info from machine -//////////////////////////////////////////////////////////////////////////////// - -// get global ids to locate tread/team info (constant regardless of OMP) -DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); -DEVICE int GetMasterThreadID(); -DEVICE int GetNumberOfWorkersInTeam(); - -// get OpenMP thread and team ids -DEVICE int GetOmpThreadId(int threadId, - bool isSPMDExecutionMode); // omp_thread_num -DEVICE int GetOmpTeamId(); // omp_team_num - -// get OpenMP number of threads and team -DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads -DEVICE int GetNumberOfOmpTeams(); // omp_num_teams - -// get OpenMP number of procs -DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); -DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); - -// masters -DEVICE int IsTeamMaster(int ompThreadId); - -// Parallel level -DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); -DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); - -//////////////////////////////////////////////////////////////////////////////// -// Memory -//////////////////////////////////////////////////////////////////////////////// - -// safe alloc and free -DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success -DEVICE void *SafeFree(void *ptr, const char *msg); -// pad to a alignment (power of 2 only) -DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); -#define ADD_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) + (_bytes))) -#define SUB_BYTES(_addr, _bytes) \ - ((void *)((char *)((void *)(_addr)) - (_bytes))) - -//////////////////////////////////////////////////////////////////////////////// -// Teams Reduction Scratchpad Helpers -//////////////////////////////////////////////////////////////////////////////// -DEVICE unsigned int *GetTeamsReductionTimestamp(); -DEVICE char *GetTeamsReductionScratchpad(); -DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr); - -#endif +//===--------- support.h - OpenMP GPU support functions ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Wrapper to some functions natively supported by the GPU. +// +//===----------------------------------------------------------------------===// + +#ifndef OMPTARGET_SUPPORT_H +#define OMPTARGET_SUPPORT_H + +#include "interface.h" +#include "target_impl.h" + +//////////////////////////////////////////////////////////////////////////////// +// Execution Parameters +//////////////////////////////////////////////////////////////////////////////// +enum ExecutionMode { + Spmd = 0x00u, + Generic = 0x01u, + ModeMask = 0x01u, +}; + +enum RuntimeMode { + RuntimeInitialized = 0x00u, + RuntimeUninitialized = 0x02u, + RuntimeMask = 0x02u, +}; + +DEVICE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode); +DEVICE bool isGenericMode(); +DEVICE bool isSPMDMode(); +DEVICE bool isRuntimeUninitialized(); +DEVICE bool isRuntimeInitialized(); + +//////////////////////////////////////////////////////////////////////////////// +// Execution Modes based on location parameter fields +//////////////////////////////////////////////////////////////////////////////// + +DEVICE bool checkSPMDMode(kmp_Ident *loc); +DEVICE bool checkGenericMode(kmp_Ident *loc); +DEVICE bool checkRuntimeUninitialized(kmp_Ident *loc); +DEVICE bool checkRuntimeInitialized(kmp_Ident *loc); + +//////////////////////////////////////////////////////////////////////////////// +// get info from machine +//////////////////////////////////////////////////////////////////////////////// + +// get global ids to locate tread/team info (constant regardless of OMP) +DEVICE int GetLogicalThreadIdInBlock(bool isSPMDExecutionMode); +DEVICE int GetMasterThreadID(); +DEVICE int GetNumberOfWorkersInTeam(); + +// get OpenMP thread and team ids +DEVICE int GetOmpThreadId(int threadId, + bool isSPMDExecutionMode); // omp_thread_num +DEVICE int GetOmpTeamId(); // omp_team_num + +// get OpenMP number of threads and team +DEVICE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads +DEVICE int GetNumberOfOmpTeams(); // omp_num_teams + +// get OpenMP number of procs +DEVICE int GetNumberOfProcsInTeam(bool isSPMDExecutionMode); +DEVICE int GetNumberOfProcsInDevice(bool isSPMDExecutionMode); + +// masters +DEVICE int IsTeamMaster(int ompThreadId); + +// Parallel level +DEVICE void IncParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); +DEVICE void DecParallelLevel(bool ActiveParallel, __kmpc_impl_lanemask_t Mask); + +//////////////////////////////////////////////////////////////////////////////// +// Memory +//////////////////////////////////////////////////////////////////////////////// + +// safe alloc and free +DEVICE void *SafeMalloc(size_t size, const char *msg); // check if success +DEVICE void *SafeFree(void *ptr, const char *msg); +// pad to a alignment (power of 2 only) +DEVICE unsigned long PadBytes(unsigned long size, unsigned long alignment); +#define ADD_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) + (_bytes))) +#define SUB_BYTES(_addr, _bytes) \ + ((void *)((char *)((void *)(_addr)) - (_bytes))) + +//////////////////////////////////////////////////////////////////////////////// +// Teams Reduction Scratchpad Helpers +//////////////////////////////////////////////////////////////////////////////// +DEVICE unsigned int *GetTeamsReductionTimestamp(); +DEVICE char 
*GetTeamsReductionScratchpad();
+DEVICE void SetTeamsReductionScratchpadPtr(void *ScratchpadPtr);
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/common/target_atomic.h b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
index 3c905d3cbbf2d..8fd96451790b6 100644
--- a/openmp/libomptarget/deviceRTLs/common/target_atomic.h
+++ b/openmp/libomptarget/deviceRTLs/common/target_atomic.h
@@ -1,38 +1,38 @@
-//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Declarations of atomic functions provided by each target
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef OMPTARGET_TARGET_ATOMIC_H
-#define OMPTARGET_TARGET_ATOMIC_H
-
-#include "target_impl.h"
-
-template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
-  return atomicAdd(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
-  return atomicInc(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
-  return atomicMax(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
-  return atomicExch(address, val);
-}
-
-template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
-  return atomicCAS(address, compare, val);
-}
-
-#endif
+//===---- target_atomic.h - OpenMP GPU target atomic functions ---- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Declarations of atomic functions provided by each target
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef OMPTARGET_TARGET_ATOMIC_H
+#define OMPTARGET_TARGET_ATOMIC_H
+
+#include "target_impl.h"
+
+template <typename T> INLINE T __kmpc_atomic_add(T *address, T val) {
+  return atomicAdd(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_inc(T *address, T val) {
+  return atomicInc(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_max(T *address, T val) {
+  return atomicMax(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_exchange(T *address, T val) {
+  return atomicExch(address, val);
+}
+
+template <typename T> INLINE T __kmpc_atomic_cas(T *address, T compare, T val) {
+  return atomicCAS(address, compare, val);
+}
+
+#endif
diff --git a/openmp/libomptarget/deviceRTLs/interface.h b/openmp/libomptarget/deviceRTLs/interface.h
index 3c216a5e61c5e..c6d6b55f17d98 100644
--- a/openmp/libomptarget/deviceRTLs/interface.h
+++ b/openmp/libomptarget/deviceRTLs/interface.h
@@ -1,542 +1,542 @@
-//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains all the definitions that are relevant to
-// the interface. The first section contains the interface as
-// declared by OpenMP.
The second section includes the compiler -// specific interfaces. -// -//===----------------------------------------------------------------------===// - -#ifndef _INTERFACES_H_ -#define _INTERFACES_H_ - -#include -#include - -#ifdef __AMDGCN__ -#include "amdgcn/src/amdgcn_interface.h" -#endif -#ifdef __CUDACC__ -#include "nvptx/src/nvptx_interface.h" -#endif - -//////////////////////////////////////////////////////////////////////////////// -// OpenMP interface -//////////////////////////////////////////////////////////////////////////////// - -typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ - -typedef enum omp_sched_t { - omp_sched_static = 1, /* chunkSize >0 */ - omp_sched_dynamic = 2, /* chunkSize >0 */ - omp_sched_guided = 3, /* chunkSize >0 */ - omp_sched_auto = 4, /* no chunkSize */ -} omp_sched_t; - -typedef enum omp_proc_bind_t { - omp_proc_bind_false = 0, - omp_proc_bind_true = 1, - omp_proc_bind_master = 2, - omp_proc_bind_close = 3, - omp_proc_bind_spread = 4 -} omp_proc_bind_t; - -EXTERN double omp_get_wtick(void); -EXTERN double omp_get_wtime(void); - -EXTERN void omp_set_num_threads(int num); -EXTERN int omp_get_num_threads(void); -EXTERN int omp_get_max_threads(void); -EXTERN int omp_get_thread_limit(void); -EXTERN int omp_get_thread_num(void); -EXTERN int omp_get_num_procs(void); -EXTERN int omp_in_parallel(void); -EXTERN int omp_in_final(void); -EXTERN void omp_set_dynamic(int flag); -EXTERN int omp_get_dynamic(void); -EXTERN void omp_set_nested(int flag); -EXTERN int omp_get_nested(void); -EXTERN void omp_set_max_active_levels(int level); -EXTERN int omp_get_max_active_levels(void); -EXTERN int omp_get_level(void); -EXTERN int omp_get_active_level(void); -EXTERN int omp_get_ancestor_thread_num(int level); -EXTERN int omp_get_team_size(int level); - -EXTERN void omp_init_lock(omp_lock_t *lock); -EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_destroy_lock(omp_lock_t *lock); -EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_set_lock(omp_lock_t *lock); -EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); -EXTERN void omp_unset_lock(omp_lock_t *lock); -EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); -EXTERN int omp_test_lock(omp_lock_t *lock); -EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); - -EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); -EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); -EXTERN omp_proc_bind_t omp_get_proc_bind(void); -EXTERN int omp_get_cancellation(void); -EXTERN void omp_set_default_device(int deviceId); -EXTERN int omp_get_default_device(void); -EXTERN int omp_get_num_devices(void); -EXTERN int omp_get_num_teams(void); -EXTERN int omp_get_team_num(void); -EXTERN int omp_is_initial_device(void); -EXTERN int omp_get_initial_device(void); -EXTERN int omp_get_max_task_priority(void); - -//////////////////////////////////////////////////////////////////////////////// -// file below is swiped from kmpc host interface -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// kmp specific types -//////////////////////////////////////////////////////////////////////////////// - -typedef enum kmp_sched_t { - kmp_sched_static_chunk = 33, - kmp_sched_static_nochunk = 34, - kmp_sched_dynamic = 35, - kmp_sched_guided = 36, - kmp_sched_runtime = 37, - kmp_sched_auto = 38, - - kmp_sched_static_balanced_chunk = 45, - - 
kmp_sched_static_ordered = 65, - kmp_sched_static_nochunk_ordered = 66, - kmp_sched_dynamic_ordered = 67, - kmp_sched_guided_ordered = 68, - kmp_sched_runtime_ordered = 69, - kmp_sched_auto_ordered = 70, - - kmp_sched_distr_static_chunk = 91, - kmp_sched_distr_static_nochunk = 92, - kmp_sched_distr_static_chunk_sched_static_chunkone = 93, - - kmp_sched_default = kmp_sched_static_nochunk, - kmp_sched_unordered_first = kmp_sched_static_chunk, - kmp_sched_unordered_last = kmp_sched_auto, - kmp_sched_ordered_first = kmp_sched_static_ordered, - kmp_sched_ordered_last = kmp_sched_auto_ordered, - kmp_sched_distribute_first = kmp_sched_distr_static_chunk, - kmp_sched_distribute_last = - kmp_sched_distr_static_chunk_sched_static_chunkone, - - /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers. - * Since we need to distinguish the three possible cases (no modifier, - * monotonic modifier, nonmonotonic modifier), we need separate bits for - * each modifier. The absence of monotonic does not imply nonmonotonic, - * especially since 4.5 says that the behaviour of the "no modifier" case - * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0. - * - * Since we're passing a full 32 bit value, we can use a couple of high - * bits for these flags; out of paranoia we avoid the sign bit. - * - * These modifiers can be or-ed into non-static schedules by the compiler - * to pass the additional information. They will be stripped early in the - * processing in __kmp_dispatch_init when setting up schedules, so - * most of the code won't ever see schedules with these bits set. - */ - kmp_sched_modifier_monotonic = (1 << 29), - /**< Set if the monotonic schedule modifier was present */ - kmp_sched_modifier_nonmonotonic = (1 << 30), -/**< Set if the nonmonotonic schedule modifier was present */ - -#define SCHEDULE_WITHOUT_MODIFIERS(s) \ - (enum kmp_sched_t)( \ - (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) -#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0) -#define SCHEDULE_HAS_NONMONOTONIC(s) \ - (((s)&kmp_sched_modifier_nonmonotonic) != 0) -#define SCHEDULE_HAS_NO_MODIFIERS(s) \ - (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \ - 0) - -} kmp_sched_t; - -/*! - * Enum for accesseing the reserved_2 field of the ident_t struct below. - */ -enum { - /*! Bit set to 1 when in SPMD mode. */ - KMP_IDENT_SPMD_MODE = 0x01, - /*! Bit set to 1 when a simplified runtime is used. */ - KMP_IDENT_SIMPLE_RT_MODE = 0x02, -}; - -/*! - * The ident structure that describes a source location. - * The struct is identical to the one in the kmp.h file. - * We maintain the same data structure for compatibility. - */ -typedef int kmp_int32; -typedef struct ident { - kmp_int32 reserved_1; /**< might be used in Fortran; see above */ - kmp_int32 flags; /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC - identifies this union member */ - kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */ - kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */ - char const *psource; /**< String describing the source location. - The string is composed of semi-colon separated fields - which describe the source file, the function and a pair - of line numbers that delimit the construct. 
*/ -} ident_t; - -// parallel defs -typedef ident_t kmp_Ident; -typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); -typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); -typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); -typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, - int16_t lane_offset, - int16_t shortCircuit); -typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width); -typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, - int32_t index, int32_t width, - int32_t reduce); -typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); - -// task defs -typedef struct kmp_TaskDescr kmp_TaskDescr; -typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); -typedef struct kmp_TaskDescr { - void *sharedPointerTable; // ptr to a table of shared var ptrs - kmp_TaskFctPtr sub; // task subroutine - int32_t partId; // unused - kmp_TaskFctPtr destructors; // destructor of c++ first private -} kmp_TaskDescr; - -// sync defs -typedef int32_t kmp_CriticalName[8]; - -//////////////////////////////////////////////////////////////////////////////// -// external interface -//////////////////////////////////////////////////////////////////////////////// - -// parallel -EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); -EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, - int32_t num_threads); -// simd -EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, - int32_t simd_limit); -// aee ... not supported -// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr -// microtask, ...); -EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, - uint32_t global_tid); -EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); - -// proc bind -EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, - int proc_bind); -EXTERN int omp_get_num_places(void); -EXTERN int omp_get_place_num_procs(int place_num); -EXTERN void omp_get_place_proc_ids(int place_num, int *ids); -EXTERN int omp_get_place_num(void); -EXTERN int omp_get_partition_num_places(void); -EXTERN void omp_get_partition_place_nums(int *place_nums); - -// for static (no chunk or chunk) -EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, 
int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, - int32_t global_tid, int32_t sched, - int32_t *plastiter, - int32_t *plower, int32_t *pupper, - int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_4u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, - uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, - int32_t chunk); -EXTERN -void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, - int32_t global_tid, int32_t sched, - int32_t *plastiter, - int64_t *plower, int64_t *pupper, - int64_t *pstride, int64_t incr, - int64_t chunk); -EXTERN -void __kmpc_for_static_init_8u_simple_generic( - kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, - uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, - int64_t chunk); - -EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); - -// for dynamic -EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int32_t lower, int32_t upper, - int32_t incr, int32_t chunk); -EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint32_t lower, - uint32_t upper, int32_t incr, - int32_t chunk); -EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, - int32_t sched, int64_t lower, int64_t upper, - int64_t incr, int64_t chunk); -EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, - int32_t sched, uint64_t lower, - uint64_t upper, int64_t incr, - int64_t chunk); - -EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int32_t *plower, - int32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint32_t *plower, - uint32_t *pupper, int32_t *pstride); -EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, int64_t *plower, - int64_t *pupper, int64_t *pstride); -EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, - int32_t *plastiter, uint64_t *plower, - uint64_t *pupper, int64_t *pstride); - -EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); - -// Support for reducing conditional lastprivate variables -EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, - int32_t global_tid, - int32_t varNum, void *array); - -// reduction -EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); -EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); -EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr 
cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, - void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( - kmp_Ident *loc, int32_t global_tid, void *global_buffer, - int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, - kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, - kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, - kmp_ListGlobalFctPtr glredFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( - int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, - kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, - kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); -EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, - int32_t global_tid, - kmp_CriticalName *crit); -EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); -EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); - -// sync barrier -EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); -EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); -EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); - -// single -EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); - -// sync -EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); -EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, - kmp_CriticalName *crit); -EXTERN void __kmpc_flush(kmp_Ident *loc); - -// vote -EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); -// syncwarp -EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); - -// tasks 
-EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, - uint32_t global_tid, int32_t flag, - size_t sizeOfTaskInclPrivate, - size_t sizeOfSharedTable, - kmp_TaskFctPtr sub); -EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); -EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newLegacyTaskDescr); -EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); -EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, - int end_part); -EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); -EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, - kmp_TaskDescr *newKmpTaskDescr, int if_val, - uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, - int32_t sched, uint64_t grainsize, void *task_dup); - -// cancel -EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); -EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, - int32_t cancelVal); - -// non standard -EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); -EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, - int16_t RequiresDataSharing); -EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); -EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); -EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, - int16_t IsOMPRuntimeInitialized); -EXTERN bool __kmpc_kernel_parallel(void **WorkFn, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_kernel_end_parallel(); -EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, - int32_t *LaneSource); -EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); -EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, - __kmpc_impl_lanemask_t Mask, - bool *IsFinal, int32_t *LaneSource, - int32_t *LaneId, int32_t *NumLanes); -EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); - - -EXTERN void __kmpc_data_sharing_init_stack(); -EXTERN void __kmpc_data_sharing_init_stack_spmd(); -EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, - int16_t UseSharedMemory); -EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); -EXTERN void __kmpc_data_sharing_pop_stack(void *a); -EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); -EXTERN void __kmpc_end_sharing_variables(); -EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); - -// The slot used for data sharing by the master and worker threads. We use a -// complete (default size version and an incomplete one so that we allow sizes -// greater than the default). 
-struct __kmpc_data_sharing_slot { - __kmpc_data_sharing_slot *Next; - __kmpc_data_sharing_slot *Prev; - void *PrevSlotStackPtr; - void *DataEnd; - char Data[]; -}; -EXTERN void -__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, - size_t InitialDataSize); -EXTERN void *__kmpc_data_sharing_environment_begin( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - size_t SharingDataSize, size_t SharingDefaultDataSize, - int16_t IsOMPRuntimeInitialized); -EXTERN void __kmpc_data_sharing_environment_end( - __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, - void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, - int32_t IsEntryPoint); - -EXTERN void * -__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, - int16_t IsOMPRuntimeInitialized); - -// SPMD execution mode interrogation function. -EXTERN int8_t __kmpc_is_spmd_exec_mode(); - -EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, - const void *buf, size_t size, - int16_t is_shared, const void **res); - -EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, - int16_t is_shared); - -#endif +//===------- interface.h - OpenMP interface definitions ---------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains all the definitions that are relevant to +// the interface. The first section contains the interface as +// declared by OpenMP. The second section includes the compiler +// specific interfaces. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _INTERFACES_H_ +#define _INTERFACES_H_ + +#include +#include + +#ifdef __AMDGCN__ +#include "amdgcn/src/amdgcn_interface.h" +#endif +#ifdef __CUDACC__ +#include "nvptx/src/nvptx_interface.h" +#endif + +//////////////////////////////////////////////////////////////////////////////// +// OpenMP interface +//////////////////////////////////////////////////////////////////////////////// + +typedef uint64_t omp_nest_lock_t; /* arbitrary type of the right length */ + +typedef enum omp_sched_t { + omp_sched_static = 1, /* chunkSize >0 */ + omp_sched_dynamic = 2, /* chunkSize >0 */ + omp_sched_guided = 3, /* chunkSize >0 */ + omp_sched_auto = 4, /* no chunkSize */ +} omp_sched_t; + +typedef enum omp_proc_bind_t { + omp_proc_bind_false = 0, + omp_proc_bind_true = 1, + omp_proc_bind_master = 2, + omp_proc_bind_close = 3, + omp_proc_bind_spread = 4 +} omp_proc_bind_t; + +EXTERN double omp_get_wtick(void); +EXTERN double omp_get_wtime(void); + +EXTERN void omp_set_num_threads(int num); +EXTERN int omp_get_num_threads(void); +EXTERN int omp_get_max_threads(void); +EXTERN int omp_get_thread_limit(void); +EXTERN int omp_get_thread_num(void); +EXTERN int omp_get_num_procs(void); +EXTERN int omp_in_parallel(void); +EXTERN int omp_in_final(void); +EXTERN void omp_set_dynamic(int flag); +EXTERN int omp_get_dynamic(void); +EXTERN void omp_set_nested(int flag); +EXTERN int omp_get_nested(void); +EXTERN void omp_set_max_active_levels(int level); +EXTERN int omp_get_max_active_levels(void); +EXTERN int omp_get_level(void); +EXTERN int omp_get_active_level(void); +EXTERN int omp_get_ancestor_thread_num(int level); +EXTERN int omp_get_team_size(int level); + +EXTERN void omp_init_lock(omp_lock_t *lock); +EXTERN void omp_init_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_destroy_lock(omp_lock_t *lock); +EXTERN void omp_destroy_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_set_lock(omp_lock_t *lock); +EXTERN void omp_set_nest_lock(omp_nest_lock_t *lock); +EXTERN void omp_unset_lock(omp_lock_t *lock); +EXTERN void omp_unset_nest_lock(omp_nest_lock_t *lock); +EXTERN int omp_test_lock(omp_lock_t *lock); +EXTERN int omp_test_nest_lock(omp_nest_lock_t *lock); + +EXTERN void omp_get_schedule(omp_sched_t *kind, int *modifier); +EXTERN void omp_set_schedule(omp_sched_t kind, int modifier); +EXTERN omp_proc_bind_t omp_get_proc_bind(void); +EXTERN int omp_get_cancellation(void); +EXTERN void omp_set_default_device(int deviceId); +EXTERN int omp_get_default_device(void); +EXTERN int omp_get_num_devices(void); +EXTERN int omp_get_num_teams(void); +EXTERN int omp_get_team_num(void); +EXTERN int omp_is_initial_device(void); +EXTERN int omp_get_initial_device(void); +EXTERN int omp_get_max_task_priority(void); + +//////////////////////////////////////////////////////////////////////////////// +// file below is swiped from kmpc host interface +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// kmp specific types +//////////////////////////////////////////////////////////////////////////////// + +typedef enum kmp_sched_t { + kmp_sched_static_chunk = 33, + kmp_sched_static_nochunk = 34, + kmp_sched_dynamic = 35, + kmp_sched_guided = 36, + kmp_sched_runtime = 37, + kmp_sched_auto = 38, + + kmp_sched_static_balanced_chunk = 45, + + kmp_sched_static_ordered = 65, + kmp_sched_static_nochunk_ordered = 66, + 
+////////////////////////////////////////////////////////////////////////////////
+// file below is swiped from kmpc host interface
+////////////////////////////////////////////////////////////////////////////////
+
+////////////////////////////////////////////////////////////////////////////////
+// kmp specific types
+////////////////////////////////////////////////////////////////////////////////
+
+typedef enum kmp_sched_t {
+  kmp_sched_static_chunk = 33,
+  kmp_sched_static_nochunk = 34,
+  kmp_sched_dynamic = 35,
+  kmp_sched_guided = 36,
+  kmp_sched_runtime = 37,
+  kmp_sched_auto = 38,
+
+  kmp_sched_static_balanced_chunk = 45,
+
+  kmp_sched_static_ordered = 65,
+  kmp_sched_static_nochunk_ordered = 66,
+  kmp_sched_dynamic_ordered = 67,
+  kmp_sched_guided_ordered = 68,
+  kmp_sched_runtime_ordered = 69,
+  kmp_sched_auto_ordered = 70,
+
+  kmp_sched_distr_static_chunk = 91,
+  kmp_sched_distr_static_nochunk = 92,
+  kmp_sched_distr_static_chunk_sched_static_chunkone = 93,
+
+  kmp_sched_default = kmp_sched_static_nochunk,
+  kmp_sched_unordered_first = kmp_sched_static_chunk,
+  kmp_sched_unordered_last = kmp_sched_auto,
+  kmp_sched_ordered_first = kmp_sched_static_ordered,
+  kmp_sched_ordered_last = kmp_sched_auto_ordered,
+  kmp_sched_distribute_first = kmp_sched_distr_static_chunk,
+  kmp_sched_distribute_last =
+      kmp_sched_distr_static_chunk_sched_static_chunkone,
+
+  /* Support for OpenMP 4.5 monotonic and nonmonotonic schedule modifiers.
+   * Since we need to distinguish the three possible cases (no modifier,
+   * monotonic modifier, nonmonotonic modifier), we need separate bits for
+   * each modifier. The absence of monotonic does not imply nonmonotonic,
+   * especially since 4.5 says that the behaviour of the "no modifier" case
+   * is implementation defined in 4.5, but will become "nonmonotonic" in 5.0.
+   *
+   * Since we're passing a full 32 bit value, we can use a couple of high
+   * bits for these flags; out of paranoia we avoid the sign bit.
+   *
+   * These modifiers can be or-ed into non-static schedules by the compiler
+   * to pass the additional information. They will be stripped early in the
+   * processing in __kmp_dispatch_init when setting up schedules, so
+   * most of the code won't ever see schedules with these bits set.
+   */
+  kmp_sched_modifier_monotonic = (1 << 29),
+  /**< Set if the monotonic schedule modifier was present */
+  kmp_sched_modifier_nonmonotonic = (1 << 30),
+  /**< Set if the nonmonotonic schedule modifier was present */
+
+#define SCHEDULE_WITHOUT_MODIFIERS(s)                                          \
+  (enum kmp_sched_t)(                                                          \
+      (s) & ~(kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic))
+#define SCHEDULE_HAS_MONOTONIC(s) (((s)&kmp_sched_modifier_monotonic) != 0)
+#define SCHEDULE_HAS_NONMONOTONIC(s)                                           \
+  (((s)&kmp_sched_modifier_nonmonotonic) != 0)
+#define SCHEDULE_HAS_NO_MODIFIERS(s)                                           \
+  (((s) & (kmp_sched_modifier_nonmonotonic | kmp_sched_modifier_monotonic)) == \
+   0)
+
+} kmp_sched_t;
+
+/*!
+ * Enum for accessing the reserved_2 field of the ident_t struct below.
+ */
+enum {
+  /*! Bit set to 1 when in SPMD mode. */
+  KMP_IDENT_SPMD_MODE = 0x01,
+  /*! Bit set to 1 when a simplified runtime is used. */
+  KMP_IDENT_SIMPLE_RT_MODE = 0x02,
+};
+
+/*!
+ * The ident structure that describes a source location.
+ * The struct is identical to the one in the kmp.h file.
+ * We maintain the same data structure for compatibility.
+ */
+typedef int kmp_int32;
+typedef struct ident {
+  kmp_int32 reserved_1; /**< might be used in Fortran; see above */
+  kmp_int32 flags;      /**< also f.flags; KMP_IDENT_xxx flags; KMP_IDENT_KMPC
+                             identifies this union member */
+  kmp_int32 reserved_2; /**< not really used in Fortran any more; see above */
+  kmp_int32 reserved_3; /**< source[4] in Fortran, do not use for C++ */
+  char const *psource;  /**< String describing the source location.
+                             The string is composed of semi-colon separated fields
+                             which describe the source file, the function and a pair
+                             of line numbers that delimit the construct.
*/ +} ident_t; + +// parallel defs +typedef ident_t kmp_Ident; +typedef void (*kmp_ParFctPtr)(int32_t *global_tid, int32_t *bound_tid, ...); +typedef void (*kmp_ReductFctPtr)(void *lhsData, void *rhsData); +typedef void (*kmp_InterWarpCopyFctPtr)(void *src, int32_t warp_num); +typedef void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, + int16_t lane_offset, + int16_t shortCircuit); +typedef void (*kmp_CopyToScratchpadFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width); +typedef void (*kmp_LoadReduceFctPtr)(void *reduceData, void *scratchpad, + int32_t index, int32_t width, + int32_t reduce); +typedef void (*kmp_ListGlobalFctPtr)(void *buffer, int idx, void *reduce_data); + +// task defs +typedef struct kmp_TaskDescr kmp_TaskDescr; +typedef int32_t (*kmp_TaskFctPtr)(int32_t global_tid, kmp_TaskDescr *taskDescr); +typedef struct kmp_TaskDescr { + void *sharedPointerTable; // ptr to a table of shared var ptrs + kmp_TaskFctPtr sub; // task subroutine + int32_t partId; // unused + kmp_TaskFctPtr destructors; // destructor of c++ first private +} kmp_TaskDescr; + +// sync defs +typedef int32_t kmp_CriticalName[8]; + +//////////////////////////////////////////////////////////////////////////////// +// external interface +//////////////////////////////////////////////////////////////////////////////// + +// parallel +EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc); +EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid, + int32_t num_threads); +// simd +EXTERN void __kmpc_push_simd_limit(kmp_Ident *loc, int32_t global_tid, + int32_t simd_limit); +// aee ... not supported +// EXTERN void __kmpc_fork_call(kmp_Ident *loc, int32_t argc, kmp_ParFctPtr +// microtask, ...); +EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_serialized_parallel(kmp_Ident *loc, + uint32_t global_tid); +EXTERN uint16_t __kmpc_parallel_level(kmp_Ident *loc, uint32_t global_tid); + +// proc bind +EXTERN void __kmpc_push_proc_bind(kmp_Ident *loc, uint32_t global_tid, + int proc_bind); +EXTERN int omp_get_num_places(void); +EXTERN int omp_get_place_num_procs(int place_num); +EXTERN void omp_get_place_proc_ids(int place_num, int *ids); +EXTERN int omp_get_place_num(void); +EXTERN int omp_get_partition_num_places(void); +EXTERN void omp_get_partition_place_nums(int *place_nums); + +// for static (no chunk or chunk) +EXTERN void __kmpc_for_static_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_for_static_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN void __kmpc_for_static_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, 
int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_spmd(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_4_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int32_t *plower, int32_t *pupper, + int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_4u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter, + uint32_t *plower, uint32_t *pupper, int32_t *pstride, int32_t incr, + int32_t chunk); +EXTERN +void __kmpc_for_static_init_8_simple_generic(kmp_Ident *loc, + int32_t global_tid, int32_t sched, + int32_t *plastiter, + int64_t *plower, int64_t *pupper, + int64_t *pstride, int64_t incr, + int64_t chunk); +EXTERN +void __kmpc_for_static_init_8u_simple_generic( + kmp_Ident *loc, int32_t global_tid, int32_t sched, int32_t *plastiter1, + uint64_t *plower, uint64_t *pupper, int64_t *pstride, int64_t incr, + int64_t chunk); + +EXTERN void __kmpc_for_static_fini(kmp_Ident *loc, int32_t global_tid); + +// for dynamic +EXTERN void __kmpc_dispatch_init_4(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int32_t lower, int32_t upper, + int32_t incr, int32_t chunk); +EXTERN void __kmpc_dispatch_init_4u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint32_t lower, + uint32_t upper, int32_t incr, + int32_t chunk); +EXTERN void __kmpc_dispatch_init_8(kmp_Ident *loc, int32_t global_tid, + int32_t sched, int64_t lower, int64_t upper, + int64_t incr, int64_t chunk); +EXTERN void __kmpc_dispatch_init_8u(kmp_Ident *loc, int32_t global_tid, + int32_t sched, uint64_t lower, + uint64_t upper, int64_t incr, + int64_t chunk); + +EXTERN int __kmpc_dispatch_next_4(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int32_t *plower, + int32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_4u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint32_t *plower, + uint32_t *pupper, int32_t *pstride); +EXTERN int __kmpc_dispatch_next_8(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, int64_t *plower, + int64_t *pupper, int64_t *pstride); +EXTERN int __kmpc_dispatch_next_8u(kmp_Ident *loc, int32_t global_tid, + int32_t *plastiter, uint64_t *plower, + uint64_t *pupper, int64_t *pstride); + +EXTERN void __kmpc_dispatch_fini_4(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_4u(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_dispatch_fini_8u(kmp_Ident *loc, int32_t global_tid); + +// Support for reducing conditional lastprivate variables +EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, + int32_t global_tid, + int32_t varNum, void *array); + +// reduction +EXTERN void __kmpc_nvptx_end_reduce(int32_t global_tid); +EXTERN void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid); +EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr 
cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size, + void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_simd_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2( + kmp_Ident *loc, int32_t global_tid, void *global_buffer, + int32_t num_of_records, void *reduce_data, kmp_ShuffleReductFctPtr shflFct, + kmp_InterWarpCopyFctPtr cpyFct, kmp_ListGlobalFctPtr lgcpyFct, + kmp_ListGlobalFctPtr lgredFct, kmp_ListGlobalFctPtr glcpyFct, + kmp_ListGlobalFctPtr glredFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic( + int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data, + kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct, + kmp_CopyToScratchpadFctPtr sratchFct, kmp_LoadReduceFctPtr ldFct); +EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, + int32_t global_tid, + kmp_CriticalName *crit); +EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size); +EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size); + +// sync barrier +EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_spmd(kmp_Ident *loc_ref, int32_t tid); +EXTERN void __kmpc_barrier_simple_generic(kmp_Ident *loc_ref, int32_t tid); +EXTERN int32_t __kmpc_cancel_barrier(kmp_Ident *loc, int32_t global_tid); + +// single +EXTERN int32_t __kmpc_single(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_single(kmp_Ident *loc, int32_t global_tid); + +// sync +EXTERN int32_t __kmpc_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_master(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_end_ordered(kmp_Ident *loc, int32_t global_tid); +EXTERN void __kmpc_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_end_critical(kmp_Ident *loc, int32_t global_tid, + kmp_CriticalName *crit); +EXTERN void __kmpc_flush(kmp_Ident *loc); + +// vote +EXTERN __kmpc_impl_lanemask_t __kmpc_warp_active_thread_mask(); +// syncwarp +EXTERN void __kmpc_syncwarp(__kmpc_impl_lanemask_t); + +// tasks 
+EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc, + uint32_t global_tid, int32_t flag, + size_t sizeOfTaskInclPrivate, + size_t sizeOfSharedTable, + kmp_TaskFctPtr sub); +EXTERN int32_t __kmpc_omp_task(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN int32_t __kmpc_omp_task_with_deps(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); +EXTERN void __kmpc_omp_task_begin_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_task_complete_if0(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newLegacyTaskDescr); +EXTERN void __kmpc_omp_wait_deps(kmp_Ident *loc, uint32_t global_tid, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +EXTERN void __kmpc_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_end_taskgroup(kmp_Ident *loc, uint32_t global_tid); +EXTERN int32_t __kmpc_omp_taskyield(kmp_Ident *loc, uint32_t global_tid, + int end_part); +EXTERN int32_t __kmpc_omp_taskwait(kmp_Ident *loc, uint32_t global_tid); +EXTERN void __kmpc_taskloop(kmp_Ident *loc, uint32_t global_tid, + kmp_TaskDescr *newKmpTaskDescr, int if_val, + uint64_t *lb, uint64_t *ub, int64_t st, int nogroup, + int32_t sched, uint64_t grainsize, void *task_dup); + +// cancel +EXTERN int32_t __kmpc_cancellationpoint(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal); +EXTERN int32_t __kmpc_cancel(kmp_Ident *loc, int32_t global_tid, + int32_t cancelVal); + +// non standard +EXTERN void __kmpc_kernel_init_params(void *ReductionScratchpadPtr); +EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime, + int16_t RequiresDataSharing); +EXTERN __attribute__((deprecated)) void __kmpc_spmd_kernel_deinit(); +EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime); +EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN bool __kmpc_kernel_parallel(void **WorkFn, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_kernel_end_parallel(); +EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, + int32_t *LaneSource); +EXTERN void __kmpc_kernel_end_convergent_parallel(void *buffer); +EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, + __kmpc_impl_lanemask_t Mask, + bool *IsFinal, int32_t *LaneSource, + int32_t *LaneId, int32_t *NumLanes); +EXTERN void __kmpc_kernel_end_convergent_simd(void *buffer); + + +EXTERN void __kmpc_data_sharing_init_stack(); +EXTERN void __kmpc_data_sharing_init_stack_spmd(); +EXTERN void *__kmpc_data_sharing_coalesced_push_stack(size_t size, + int16_t UseSharedMemory); +EXTERN void *__kmpc_data_sharing_push_stack(size_t size, int16_t UseSharedMemory); +EXTERN void __kmpc_data_sharing_pop_stack(void *a); +EXTERN void __kmpc_begin_sharing_variables(void ***GlobalArgs, size_t nArgs); +EXTERN void __kmpc_end_sharing_variables(); +EXTERN void __kmpc_get_shared_variables(void ***GlobalArgs); + +// The slot used for data sharing by the master and worker threads. We use a +// complete (default size version and an incomplete one so that we allow sizes +// greater than the default). 
+struct __kmpc_data_sharing_slot { + __kmpc_data_sharing_slot *Next; + __kmpc_data_sharing_slot *Prev; + void *PrevSlotStackPtr; + void *DataEnd; + char Data[]; +}; +EXTERN void +__kmpc_initialize_data_sharing_environment(__kmpc_data_sharing_slot *RootS, + size_t InitialDataSize); +EXTERN void *__kmpc_data_sharing_environment_begin( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + size_t SharingDataSize, size_t SharingDefaultDataSize, + int16_t IsOMPRuntimeInitialized); +EXTERN void __kmpc_data_sharing_environment_end( + __kmpc_data_sharing_slot **SavedSharedSlot, void **SavedSharedStack, + void **SavedSharedFrame, __kmpc_impl_lanemask_t *SavedActiveThreads, + int32_t IsEntryPoint); + +EXTERN void * +__kmpc_get_data_sharing_environment_frame(int32_t SourceThreadID, + int16_t IsOMPRuntimeInitialized); + +// SPMD execution mode interrogation function. +EXTERN int8_t __kmpc_is_spmd_exec_mode(); + +EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode, + const void *buf, size_t size, + int16_t is_shared, const void **res); + +EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode, + int16_t is_shared); + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt index 84b52f55b73d9..2cbddd17baecc 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/CMakeLists.txt @@ -1,199 +1,199 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available -# -##===----------------------------------------------------------------------===## - -set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING - "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") - -if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) - find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) - if(NOT ALTERNATE_CUDA_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") - endif() - set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) -endif() - -# We can't use clang as nvcc host preprocessor, so we attempt to replace it with -# gcc. 
-if(CUDA_HOST_COMPILER MATCHES clang) - - find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) - - if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) - libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") - libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") - return() - endif() - set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) -endif() - -get_filename_component(devicertl_base_directory - ${CMAKE_CURRENT_SOURCE_DIR} - DIRECTORY) -set(devicertl_common_directory - ${devicertl_base_directory}/common) -set(devicertl_nvptx_directory - ${devicertl_base_directory}/nvptx) - -if(LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Building CUDA offloading device RTL.") - - # We really don't have any host code, so we don't need to care about - # propagating host flags. - set(CUDA_PROPAGATE_HOST_FLAGS OFF) - - set(cuda_src_files - ${devicertl_common_directory}/src/cancel.cu - ${devicertl_common_directory}/src/critical.cu - ${devicertl_common_directory}/src/data_sharing.cu - ${devicertl_common_directory}/src/libcall.cu - ${devicertl_common_directory}/src/loop.cu - ${devicertl_common_directory}/src/omptarget.cu - ${devicertl_common_directory}/src/parallel.cu - ${devicertl_common_directory}/src/reduction.cu - ${devicertl_common_directory}/src/support.cu - ${devicertl_common_directory}/src/sync.cu - ${devicertl_common_directory}/src/task.cu - src/target_impl.cu - ) - - set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) - - # Get the compute capability the user requested or use SM_35 by default. - # SM_35 is what clang uses by default. - set(default_capabilities 35) - if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) - set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) - libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") - endif() - set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING - "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") - string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) - - foreach(sm ${nvptx_sm_list}) - set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) - endforeach() - - # Activate RTL message dumps if requested by the user. - set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL - "Activate NVPTX device RTL debug messages.") - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) - endif() - - # NVPTX runtime library has to be statically linked. Dynamic linking is not - # yet supported by the CUDA toolchain on the device. - set(BUILD_SHARED_LIBS OFF) - set(CUDA_SEPARABLE_COMPILATION ON) - list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) - cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} - OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) - - # Install device RTL under the lib destination folder. - install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") - - target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) - - - # Check if we can create an LLVM bitcode implementation of the runtime library - # that could be inlined in the user application. 
For that we need to find - # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and - # an LLVM linker. - set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING - "Location of a CUDA compiler capable of emitting LLVM bitcode.") - set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING - "Location of a linker capable of linking LLVM bitcode objects.") - - include(LibomptargetNVPTXBitcodeLibrary) - - set(bclib_default FALSE) - if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - set(bclib_default TRUE) - endif() - set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL - "Enable CUDA LLVM bitcode offloading device RTL.") - if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) - if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) - libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") - endif() - libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") - - # Set flags for LLVM Bitcode compilation. - set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} - -I${devicertl_base_directory} - -I${devicertl_nvptx_directory}/src) - - if(${LIBOMPTARGET_NVPTX_DEBUG}) - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) - else() - set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) - endif() - - # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared - # to handle. Therefore, we use 'weak' instead. We are compiling only for the - # device, so it should be equivalent. - if(CUDA_VERSION_MAJOR GREATER 8) - set(bc_flags ${bc_flags} -Dnv_weak=weak) - endif() - - # Create target to build all Bitcode libraries. - add_custom_target(omptarget-nvptx-bc) - - # Generate a Bitcode library for all the compute capabilities the user requested. - foreach(sm ${nvptx_sm_list}) - set(cuda_arch --cuda-gpu-arch=sm_${sm}) - - # Compile CUDA files to bitcode. - set(bc_files "") - foreach(src ${cuda_src_files}) - get_filename_component(infile ${src} ABSOLUTE) - get_filename_component(outfile ${src} NAME) - - add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} - -c ${infile} -o ${outfile}-sm_${sm}.bc - DEPENDS ${infile} - IMPLICIT_DEPENDS CXX ${infile} - COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" - VERBATIM - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) - - list(APPEND bc_files ${outfile}-sm_${sm}.bc) - endforeach() - - # Link to a bitcode library. - add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} - -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} - DEPENDS ${bc_files} - COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" - ) - set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) - - add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) - add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) - - # Copy library to destination. - add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc - $) - - # Install bitcode library under the lib destination folder. 
- install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") - endforeach() - endif() - - add_subdirectory(test) -else() - libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") -endif() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build the NVPTX (CUDA) Device RTL if the CUDA tools are available +# +##===----------------------------------------------------------------------===## + +set(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER "" CACHE STRING + "Path to alternate NVCC host compiler to be used by the NVPTX device RTL.") + +if(LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER) + find_program(ALTERNATE_CUDA_HOST_COMPILER NAMES ${LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER}) + if(NOT ALTERNATE_CUDA_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: invalid NVPTX alternate host compiler.") + endif() + set(CUDA_HOST_COMPILER ${ALTERNATE_CUDA_HOST_COMPILER} CACHE FILEPATH "" FORCE) +endif() + +# We can't use clang as nvcc host preprocessor, so we attempt to replace it with +# gcc. +if(CUDA_HOST_COMPILER MATCHES clang) + + find_program(LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER NAMES gcc) + + if(NOT LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER) + libomptarget_say("Not building CUDA offloading device RTL: clang is not supported as NVCC host compiler.") + libomptarget_say("Please include gcc in your path or set LIBOMPTARGET_NVPTX_ALTERNATE_HOST_COMPILER to the full path of of valid compiler.") + return() + endif() + set(CUDA_HOST_COMPILER "${LIBOMPTARGET_NVPTX_ALTERNATE_GCC_HOST_COMPILER}" CACHE FILEPATH "" FORCE) +endif() + +get_filename_component(devicertl_base_directory + ${CMAKE_CURRENT_SOURCE_DIR} + DIRECTORY) +set(devicertl_common_directory + ${devicertl_base_directory}/common) +set(devicertl_nvptx_directory + ${devicertl_base_directory}/nvptx) + +if(LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Building CUDA offloading device RTL.") + + # We really don't have any host code, so we don't need to care about + # propagating host flags. + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(cuda_src_files + ${devicertl_common_directory}/src/cancel.cu + ${devicertl_common_directory}/src/critical.cu + ${devicertl_common_directory}/src/data_sharing.cu + ${devicertl_common_directory}/src/libcall.cu + ${devicertl_common_directory}/src/loop.cu + ${devicertl_common_directory}/src/omptarget.cu + ${devicertl_common_directory}/src/parallel.cu + ${devicertl_common_directory}/src/reduction.cu + ${devicertl_common_directory}/src/support.cu + ${devicertl_common_directory}/src/sync.cu + ${devicertl_common_directory}/src/task.cu + src/target_impl.cu + ) + + set(omp_data_objects ${devicertl_common_directory}/src/omp_data.cu) + + # Get the compute capability the user requested or use SM_35 by default. + # SM_35 is what clang uses by default. 
+ set(default_capabilities 35) + if (DEFINED LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY) + set(default_capabilities ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY}) + libomptarget_warning_say("LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITY is deprecated, please use LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES") + endif() + set(LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES ${default_capabilities} CACHE STRING + "List of CUDA Compute Capabilities to be used to compile the NVPTX device RTL.") + string(REPLACE "," ";" nvptx_sm_list ${LIBOMPTARGET_NVPTX_COMPUTE_CAPABILITIES}) + + foreach(sm ${nvptx_sm_list}) + set(CUDA_ARCH ${CUDA_ARCH} -gencode arch=compute_${sm},code=sm_${sm}) + endforeach() + + # Activate RTL message dumps if requested by the user. + set(LIBOMPTARGET_NVPTX_DEBUG FALSE CACHE BOOL + "Activate NVPTX device RTL debug messages.") + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(CUDA_DEBUG -DOMPTARGET_NVPTX_DEBUG=-1 -g --ptxas-options=-v) + endif() + + # NVPTX runtime library has to be statically linked. Dynamic linking is not + # yet supported by the CUDA toolchain on the device. + set(BUILD_SHARED_LIBS OFF) + set(CUDA_SEPARABLE_COMPILATION ON) + list(APPEND CUDA_NVCC_FLAGS -I${devicertl_base_directory} + -I${devicertl_nvptx_directory}/src) + cuda_add_library(omptarget-nvptx STATIC ${cuda_src_files} ${omp_data_objects} + OPTIONS ${CUDA_ARCH} ${CUDA_DEBUG}) + + # Install device RTL under the lib destination folder. + install(TARGETS omptarget-nvptx ARCHIVE DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries(omptarget-nvptx ${CUDA_LIBRARIES}) + + + # Check if we can create an LLVM bitcode implementation of the runtime library + # that could be inlined in the user application. For that we need to find + # a Clang compiler capable of compiling our CUDA files to LLVM bitcode and + # an LLVM linker. + set(LIBOMPTARGET_NVPTX_CUDA_COMPILER "" CACHE STRING + "Location of a CUDA compiler capable of emitting LLVM bitcode.") + set(LIBOMPTARGET_NVPTX_BC_LINKER "" CACHE STRING + "Location of a linker capable of linking LLVM bitcode objects.") + + include(LibomptargetNVPTXBitcodeLibrary) + + set(bclib_default FALSE) + if (${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + set(bclib_default TRUE) + endif() + set(LIBOMPTARGET_NVPTX_ENABLE_BCLIB ${bclib_default} CACHE BOOL + "Enable CUDA LLVM bitcode offloading device RTL.") + if (${LIBOMPTARGET_NVPTX_ENABLE_BCLIB}) + if (NOT ${LIBOMPTARGET_NVPTX_BCLIB_SUPPORTED}) + libomptarget_error_say("Cannot build CUDA LLVM bitcode offloading device RTL!") + endif() + libomptarget_say("Building CUDA LLVM bitcode offloading device RTL.") + + # Set flags for LLVM Bitcode compilation. + set(bc_flags ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER_FLAGS} + -I${devicertl_base_directory} + -I${devicertl_nvptx_directory}/src) + + if(${LIBOMPTARGET_NVPTX_DEBUG}) + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=-1) + else() + set(bc_flags ${bc_flags} -DOMPTARGET_NVPTX_DEBUG=0) + endif() + + # CUDA 9 header files use the nv_weak attribute which clang is not yet prepared + # to handle. Therefore, we use 'weak' instead. We are compiling only for the + # device, so it should be equivalent. + if(CUDA_VERSION_MAJOR GREATER 8) + set(bc_flags ${bc_flags} -Dnv_weak=weak) + endif() + + # Create target to build all Bitcode libraries. + add_custom_target(omptarget-nvptx-bc) + + # Generate a Bitcode library for all the compute capabilities the user requested. + foreach(sm ${nvptx_sm_list}) + set(cuda_arch --cuda-gpu-arch=sm_${sm}) + + # Compile CUDA files to bitcode. 
+ set(bc_files "") + foreach(src ${cuda_src_files}) + get_filename_component(infile ${src} ABSOLUTE) + get_filename_component(outfile ${src} NAME) + + add_custom_command(OUTPUT ${outfile}-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_CUDA_COMPILER} ${bc_flags} ${cuda_arch} + -c ${infile} -o ${outfile}-sm_${sm}.bc + DEPENDS ${infile} + IMPLICIT_DEPENDS CXX ${infile} + COMMENT "Building LLVM bitcode ${outfile}-sm_${sm}.bc" + VERBATIM + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES ${outfile}-sm_${sm}.bc) + + list(APPEND bc_files ${outfile}-sm_${sm}.bc) + endforeach() + + # Link to a bitcode library. + add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + COMMAND ${LIBOMPTARGET_NVPTX_SELECTED_BC_LINKER} + -o ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc ${bc_files} + DEPENDS ${bc_files} + COMMENT "Linking LLVM bitcode libomptarget-nvptx-sm_${sm}.bc" + ) + set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES libomptarget-nvptx-sm_${sm}.bc) + + add_custom_target(omptarget-nvptx-${sm}-bc ALL DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc) + add_dependencies(omptarget-nvptx-bc omptarget-nvptx-${sm}-bc) + + # Copy library to destination. + add_custom_command(TARGET omptarget-nvptx-${sm}-bc POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc + $) + + # Install bitcode library under the lib destination folder. + install(FILES ${CMAKE_CURRENT_BINARY_DIR}/libomptarget-nvptx-sm_${sm}.bc DESTINATION "${OPENMP_INSTALL_LIBDIR}") + endforeach() + endif() + + add_subdirectory(test) +else() + libomptarget_say("Not building CUDA offloading device RTL: CUDA tools not found in the system.") +endif() diff --git a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt index 4149dfacb62ad..45c3208577401 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/docs/ReductionDesign.txt @@ -1,523 +1,523 @@ - -**Design document for OpenMP reductions on the GPU** - -//Abstract: //In this document we summarize the new design for an OpenMP -implementation of reductions on NVIDIA GPUs. This document comprises -* a succinct background review, -* an introduction to the decoupling of reduction algorithm and - data-structure-specific processing routines, -* detailed illustrations of reduction algorithms used and -* a brief overview of steps we have made beyond the last implementation. - -**Problem Review** - -Consider a typical OpenMP program with reduction pragma. - -``` - double foo, bar; - #pragma omp parallel for reduction(+:foo, bar) - for (int i = 0; i < N; i++) { - foo+=A[i]; bar+=B[i]; - } -``` -where 'foo' and 'bar' are reduced across all threads in the parallel region. -Our primary goal is to efficiently aggregate the values of foo and bar in -such manner that -* makes the compiler logically concise. -* efficiently reduces within warps, threads, blocks and the device. - -**Introduction to Decoupling** -In this section we address the problem of making the compiler -//logically concise// by partitioning the task of reduction into two broad -categories: data-structure specific routines and algorithmic routines. - -The previous reduction implementation was highly coupled with -the specificity of the reduction element data structures (e.g., sizes, data -types) and operators of the reduction (e.g., addition, multiplication). 
In -our implementation we strive to decouple them. In our final implementations, -we could remove all template functions in our runtime system. - -The (simplified) pseudo code generated by LLVM is as follows: - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, - interWarpCpyFn) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - shuffleReduceFn and interWarpCpyFn are two auxiliary functions - generated to aid the runtime performing algorithmic steps - while being data-structure agnostic about ReduceData. - - In particular, shuffleReduceFn is a function that takes the following - inputs: - a. local copy of ReduceData - b. its lane_id - c. the offset of the lane_id which hosts a remote ReduceData - relative to the current one - d. an algorithm version parameter determining which reduction - algorithm to use. - This shuffleReduceFn retrieves the remote ReduceData through shuffle - intrinsics and reduces, using the algorithm specified by the 4th - parameter, the local ReduceData and with the remote ReduceData element - wise, and places the resultant values into the local ReduceData. - - Different reduction algorithms are implemented with different runtime - functions, but they all make calls to this same shuffleReduceFn to - perform the essential reduction step. Therefore, based on the 4th - parameter, this shuffleReduceFn will behave slightly differently to - cooperate with the runtime function to ensure correctness under - different circumstances. - - InterWarpCpyFn, as the name suggests, is a function that copies data - across warps. Its function is to tunnel all the thread private - ReduceData that is already reduced within a warp to a lane in the first - warp with minimal shared memory footprint. This is an essential step to - prepare for the last step of a block reduction. - - (Warp, block, device level reduction routines that utilize these - auxiliary functions will be discussed in the next section.) - - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar -``` - -**Reduction Algorithms** - -On the warp level, we have three versions of the algorithms: - -1. Full Warp Reduction - -``` -gpu_regular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - for (int offset = WARPSIZE/2; offset > 0; offset /= 2) - ShuffleReduceFn(reduce_data, 0, offset, 0); -} -``` -ShuffleReduceFn is used here with lane_id set to 0 because it is not used -therefore we save instructions by not retrieving lane_id from the corresponding -special registers. The 4th parameters, which represents the version of the -algorithm being used here, is set to 0 to signify full warp reduction. - -In this version specified (=0), the ShuffleReduceFn behaves, per element, as -follows: - -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -reduce_elem = reduce_elem @ remote_elem; - -``` - -An illustration of this algorithm operating on a hypothetical 8-lane full-warp -would be: -{F74} -The coloring invariant follows that elements with the same color will be -combined and reduced in the next reduction step. 
As can be observed, no overhead -is present, exactly log(2, N) steps are needed. - -2. Contiguous Full Warp Reduction -``` -gpu_irregular_warp_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn, int size, - int lane_id) { - int curr_size; - int offset; - curr_size = size; - mask = curr_size/2; - while (offset>0) { - ShuffleReduceFn(reduce_data, lane_id, offset, 1); - curr_size = (curr_size+1)/2; - offset = curr_size/2; - } -} -``` - -In this version specified (=1), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (lane_id < offset) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` - -An important invariant (also a restriction on the starting state of the -reduction) is that this algorithm assumes that all unused ReduceData are -located in a contiguous subset of threads in a warp starting from lane 0. - -With the presence of a trailing active lane with an odd-numbered lane -id, its value will not be aggregated with any other lane. Therefore, -in order to preserve the invariant, such ReduceData is copied to the first lane -whose thread-local ReduceData has already being used in a previous reduction -and would therefore be useless otherwise. - -An illustration of this algorithm operating on a hypothetical 8-lane partial -warp woud be: -{F75} - -As illustrated, this version of the algorithm introduces overhead whenever -we have odd number of participating lanes in any reduction step to -copy data between lanes. - -3. Dispersed Partial Warp Reduction -``` -gpu_irregular_simt_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr ShuffleReduceFn) { - int size, remote_id; - int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2; - do { - remote_id = find_the_next_active_lane_id_right_after_me(); - // the above function returns 0 of no active lane - // is present right after the current thread. - size = get_number_of_active_lanes_in_this_warp(); - logical_lane_id /= 2; - ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2); - } while (logical_lane_id % 2 == 0 && size > 1); -``` - -There is no assumption made about the initial state of the reduction. -Any number of lanes (>=1) could be active at any position. The reduction -result is kept in the first active lane. - -In this version specified (=2), the ShuffleReduceFn behaves, per element, as -follows: -``` -//reduce_elem refers to an element in the local ReduceData -//remote_elem is retrieved from a remote lane -remote_elem = shuffle_down(reduce_elem, offset, 32); -if (LaneId % 2 == 0 && Offset > 0) { - reduce_elem = reduce_elem @ remote_elem -} else { - reduce_elem = remote_elem -} -``` -We will proceed with a brief explanation for some arguments passed in, -it is important to notice that, in this section, we will introduce the -concept of logical_lane_id, and it is important to distinguish it -from physical lane_id as defined by nvidia. -1. //logical_lane_id//: as the name suggests, it refers to the calculated - lane_id (instead of the physical one defined by nvidia) that would make - our algorithm logically concise. A thread with logical_lane_id k means - there are (k-1) threads before it. -2. //remote_id-1-threadIdx.x//: remote_id is indeed the nvidia-defined lane - id of the remote lane from which we will retrieve the ReduceData. 
We - subtract (threadIdx+1) from it because we would like to maintain only one - underlying shuffle intrinsic (which is used to communicate among lanes in a - warp). This particular version of shuffle intrinsic we take accepts only - offsets, instead of absolute lane_id. Therefore the subtraction is performed - on the absolute lane_id we calculated to obtain the offset. - -This algorithm is slightly different in 2 ways and it is not, conceptually, a -generalization of the above algorithms. -1. It reduces elements close to each other. For instance, values in the 0th lane - is to be combined with that of the 1st lane; values in the 2nd lane is to be - combined with that of the 3rd lane. We did not use the previous algorithm - where the first half of the (partial) warp is reduced with the second half - of the (partial) warp. This is because, the mapping - f(x): logical_lane_id -> physical_lane_id; - can be easily calculated whereas its inverse - f^-1(x): physical_lane_id -> logical_lane_id - cannot and performing such reduction requires the inverse to be known. -2. Because this algorithm is agnostic about the positions of the lanes that are - active, we do not need to perform the coping step as in the second - algorithm. -An illustrative run would look like -{F76} -As observed, overhead is high because in each and every step of reduction, -logical_lane_id is recalculated; so is the remote_id. - -On a block level, we have implemented the following block reduce algorithm: - -``` -gpu_irregular_block_reduce(void *reduce_data, - kmp_ShuffleReductFctPtr shuflReduceFn, - kmp_InterWarpCopyFctPtr interWarpCpyFn, - int size) { - - int wid = threadIdx.x/WARPSIZE; - int lane_id = threadIdx.x%WARPSIZE; - - int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division - - unsigned tnum = __ballot(1); - int thread_num = __popc(tnum); - - //full warp reduction - if (thread_num == WARPSIZE) { - gpu_regular_warp_reduce(reduce_data, shuflReduceFn); - } - //partial warp reduction - if (thread_num < WARPSIZE) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, - lane_id); - } - //Gather all the reduced values from each warp - //to the first warp - //named_barrier inside this function to ensure - //correctness. It is effectively a sync_thread - //that won't deadlock. - interWarpCpyFn(reduce_data, warp_needed); - - //This is to reduce data gathered from each "warp master". - if (wid==0) { - gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, - lane_id); - } - - return; -} -``` -In this function, no ShuffleReduceFn is directly called as it makes calls -to various versions of the warp-reduction functions. It first reduces -ReduceData warp by warp; in the end, we end up with the number of -ReduceData equal to the number of warps present in this thread -block. We then proceed to gather all such ReduceData to the first warp. - -As observed, in this algorithm we make use of the function InterWarpCpyFn, -which copies data from each of the "warp master" (0th lane of each warp, where -a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a -mathematical sense) the problem of reduction across warp masters in a block to -the problem of warp reduction which we already have solutions to. - -We can thus completely avoid the use of atomics to reduce in a threadblock. - -**Efficient Cross Block Reduce** - -The next challenge is to reduce values across threadblocks. We aim to do this -without atomics or critical sections. - -Let a kernel be started with TB threadblocks. 
-Let the GPU have S SMs. -There can be at most N active threadblocks per SM at any time. - -Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of -at most 'N' active threadblocks on SM s. Let each threadblock active on an SM -be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) -uniquely identifies an active threadblock on the GPU. - -To efficiently implement cross block reduce, we first allocate an array for -each value to be reduced of size S*N (which is the maximum number of active -threadblocks at any time on the device). - -Each threadblock reduces its value to slot [s][id]. This can be done without -locking since no other threadblock can write to the same slot concurrently. - -As a final stage, we reduce the values in the array as follows: - -``` -// Compiler generated wrapper function for each target region with a reduction -clause. -target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 - thread. - // Use dynamic parallelism to launch M teams, N threads as requested by the - user to execute the target region. - - target_function<>(map_args) - - Reduce values in reduction_array - -``` - -**Comparison with Last Version** - - -The (simplified) pseudo code generated by LLVM on the host is as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - where: - struct ReduceData { - double *foo; - double *bar; - } reduceData - reduceData.foo = &foo_p - reduceData.bar = &bar_p - - reduceFn is a pointer to a function that takes in two inputs - of type ReduceData, "reduces" them element wise, and places the - result in the first input: - reduceFn(ReduceData *a, ReduceData *b) - a = a @ b - - Every thread in the parallel region calls kmpc_reduce_nowait with - its private copy of reduceData. The runtime reduces across the - threads (using tree reduction on the operator 'reduceFn?) and stores - the final result in the master thread if successful. - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -On a GPU, a similar reduction may need to be performed across SIMT threads, -warps, and threadblocks. The challenge is to do so efficiently in a fashion -that is compatible with the LLVM OpenMP implementation. - -In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, -the salient steps of the code generated are as follows: - - -``` - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) - status = can_block_reduce() - if status == 1: - reduce efficiently to thread 0 using shuffles and shared memory. - return 1 - else - cannot use efficient block reduction, fallback to atomics - return 2 - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. 
else if ret == 2: - In this case kmpc_reduce_nowait() could not use tree reduction, - so use atomics instead: - each thread atomically writes to foo - each thread atomically writes to bar -``` - -The function can_block_reduce() is defined as follows: - - -``` -int32_t can_block_reduce() { - int tid = GetThreadIdInTeam(); - int nt = GetNumberOfOmpThreads(tid); - if (nt != blockDim.x) - return 0; - unsigned tnum = __ballot(1); - if (tnum != (~0x0)) { - return 0; - } - return 1; -} -``` - -This function permits the use of the efficient block reduction algorithm -using shuffles and shared memory (return 1) only if (a) all SIMT threads in -a warp are active (i.e., number of threads in the parallel region is a -multiple of 32) and (b) the number of threads in the parallel region -(set by the num_threads clause) equals blockDim.x. - -If either of these preconditions is not true, each thread in the threadblock -updates the global value using atomics. - -Atomics and compare-and-swap operations are expensive on many threaded -architectures such as GPUs and we must avoid them completely. - - -**Appendix: Implementation Details** - - -``` -// Compiler generated function. -reduceFn(ReduceData *a, ReduceData *b) - a->foo = a->foo + b->foo - a->bar = a->bar + b->bar - -// Compiler generated function. -swapAndReduceFn(ReduceData *thread_private, int lane) - ReduceData *remote = new ReduceData() - remote->foo = shuffle_double(thread_private->foo, lane) - remote->bar = shuffle_double(thread_private->bar, lane) - reduceFn(thread_private, remote) - -// OMP runtime function. -warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): - offset = 16 - while (offset > 0) - swapAndReduceFn(thread_private, offset) - offset /= 2 - -// OMP runtime function. -warpReduce_irregular(): - ... - -// OMP runtime function. -kmpc_reduce_warp(reduceData, swapAndReduceFn) - if all_lanes_active: - warpReduce_regular(reduceData, swapAndReduceFn) - else: - warpReduce_irregular(reduceData, swapAndReduceFn) - if in_simd_region: - // all done, reduce to global in simd lane 0 - return 1 - else if in_parallel_region: - // done reducing to one value per warp, now reduce across warps - return 3 - -// OMP runtime function; one for each basic type. -kmpc_reduce_block_double(double *a) - if lane == 0: - shared[wid] = *a - named_barrier(1, num_threads) - if wid == 0 - block_reduce(shared) - if lane == 0 - *a = shared[0] - named_barrier(1, num_threads) - if wid == 0 and lane == 0 - return 1 // write back reduced result - else - return 0 // don't do anything - -``` - - - -``` -// Compiler generated code. - 1. Create private copies of variables: foo_p, bar_p - 2. Each thread reduces the chunk of A and B assigned to it and writes - to foo_p and bar_p respectively. - 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) - 4. if ret == 1: - The master thread stores the reduced result in the globals. - foo += reduceData.foo; bar += reduceData.bar - 5. else if ret == 3: - ret = block_reduce_double(reduceData.foo) - if ret == 1: - foo += reduceData.foo - ret = block_reduce_double(reduceData.bar) - if ret == 1: - bar += reduceData.bar -``` - -**Notes** - - 1. This scheme requires that the CUDA OMP runtime can call llvm generated - functions. This functionality now works. - 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery - (including calls through function pointers) are optimized away. - 3. 
If we are reducing multiple to multiple variables in a parallel region, - the reduce operations are all performed in warpReduce_[ir]regular(). This - results in more instructions in the loop and should result in fewer - stalls due to data dependencies. Unfortunately we cannot do the same in - kmpc_reduce_block_double() without increasing shared memory usage. + +**Design document for OpenMP reductions on the GPU** + +//Abstract: //In this document we summarize the new design for an OpenMP +implementation of reductions on NVIDIA GPUs. This document comprises +* a succinct background review, +* an introduction to the decoupling of reduction algorithm and + data-structure-specific processing routines, +* detailed illustrations of reduction algorithms used and +* a brief overview of steps we have made beyond the last implementation. + +**Problem Review** + +Consider a typical OpenMP program with reduction pragma. + +``` + double foo, bar; + #pragma omp parallel for reduction(+:foo, bar) + for (int i = 0; i < N; i++) { + foo+=A[i]; bar+=B[i]; + } +``` +where 'foo' and 'bar' are reduced across all threads in the parallel region. +Our primary goal is to efficiently aggregate the values of foo and bar in +such manner that +* makes the compiler logically concise. +* efficiently reduces within warps, threads, blocks and the device. + +**Introduction to Decoupling** +In this section we address the problem of making the compiler +//logically concise// by partitioning the task of reduction into two broad +categories: data-structure specific routines and algorithmic routines. + +The previous reduction implementation was highly coupled with +the specificity of the reduction element data structures (e.g., sizes, data +types) and operators of the reduction (e.g., addition, multiplication). In +our implementation we strive to decouple them. In our final implementations, +we could remove all template functions in our runtime system. + +The (simplified) pseudo code generated by LLVM is as follows: + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_nvptx_reduce_nowait(..., reduceData, shuffleReduceFn, + interWarpCpyFn) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + shuffleReduceFn and interWarpCpyFn are two auxiliary functions + generated to aid the runtime performing algorithmic steps + while being data-structure agnostic about ReduceData. + + In particular, shuffleReduceFn is a function that takes the following + inputs: + a. local copy of ReduceData + b. its lane_id + c. the offset of the lane_id which hosts a remote ReduceData + relative to the current one + d. an algorithm version parameter determining which reduction + algorithm to use. + This shuffleReduceFn retrieves the remote ReduceData through shuffle + intrinsics and reduces, using the algorithm specified by the 4th + parameter, the local ReduceData and with the remote ReduceData element + wise, and places the resultant values into the local ReduceData. + + Different reduction algorithms are implemented with different runtime + functions, but they all make calls to this same shuffleReduceFn to + perform the essential reduction step. Therefore, based on the 4th + parameter, this shuffleReduceFn will behave slightly differently to + cooperate with the runtime function to ensure correctness under + different circumstances. 
+
+    InterWarpCpyFn, as the name suggests, is a function that copies data
+    across warps. Its purpose is to tunnel all the thread-private
+    ReduceData that has already been reduced within a warp to a lane in
+    the first warp with a minimal shared memory footprint. This is an
+    essential step to prepare for the last step of a block reduction.
+
+    (Warp, block and device level reduction routines that utilize these
+    auxiliary functions will be discussed in the next section.)
+
+ 4. if ret == 1:
+     The master thread stores the reduced result in the globals.
+     foo += reduceData.foo; bar += reduceData.bar
+```
+
+**Reduction Algorithms**
+
+On the warp level, we have three versions of the algorithm:
+
+1. Full Warp Reduction
+
+```
+gpu_regular_warp_reduce(void *reduce_data,
+                        kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
+    ShuffleReduceFn(reduce_data, 0, offset, 0);
+}
+```
+ShuffleReduceFn is called here with lane_id set to 0 because the lane_id is
+not used by this version; we therefore save instructions by not retrieving it
+from the corresponding special registers. The 4th parameter, which represents
+the version of the algorithm being used, is set to 0 to signify full warp
+reduction.
+
+In this version (=0), the ShuffleReduceFn behaves, per element, as follows:
+
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+reduce_elem = reduce_elem @ remote_elem;
+```
+
+An illustration of this algorithm operating on a hypothetical 8-lane full
+warp would be:
+{F74}
+The coloring invariant is that elements with the same color will be combined
+and reduced in the next reduction step. As can be observed, there is no
+overhead: exactly log(2, N) steps are needed.
+
+2. Contiguous Partial Warp Reduction
+```
+gpu_irregular_warp_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn, int size,
+                          int lane_id) {
+  int curr_size;
+  int offset;
+  curr_size = size;
+  offset = curr_size/2;
+  while (offset > 0) {
+    ShuffleReduceFn(reduce_data, lane_id, offset, 1);
+    curr_size = (curr_size+1)/2;
+    offset = curr_size/2;
+  }
+}
+```
+
+In this version (=1), the ShuffleReduceFn behaves, per element, as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (lane_id < offset) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+
+An important invariant (also a restriction on the starting state of the
+reduction) is that this algorithm assumes that all the ReduceData still to be
+reduced are located in a contiguous subset of lanes in a warp starting from
+lane 0.
+
+If there is a trailing active lane with an odd-numbered lane id, its value
+will not be aggregated with that of any other lane. Therefore, in order to
+preserve the invariant, such ReduceData is copied to the first lane whose
+thread-local ReduceData has already been used in a previous reduction step
+and would therefore be useless otherwise.
+
+An illustration of this algorithm operating on a hypothetical 8-lane partial
+warp would be:
+{F75}
+
+As illustrated, this version of the algorithm introduces overhead whenever
+there is an odd number of participating lanes in any reduction step, because
+data must be copied between lanes.
+
+3. 
Dispersed Partial Warp Reduction
+```
+gpu_irregular_simt_reduce(void *reduce_data,
+                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
+  int size, remote_id;
+  int logical_lane_id = find_number_of_dispersed_active_lanes_before_me() * 2;
+  do {
+    remote_id = find_the_next_active_lane_id_right_after_me();
+    // The above function returns 0 if no active lane
+    // is present right after the current thread.
+    size = get_number_of_active_lanes_in_this_warp();
+    logical_lane_id /= 2;
+    ShuffleReduceFn(reduce_data, logical_lane_id, remote_id-1-threadIdx.x, 2);
+  } while (logical_lane_id % 2 == 0 && size > 1);
+}
+```
+
+There is no assumption made about the initial state of the reduction.
+Any number of lanes (>=1) could be active at any position. The reduction
+result is kept in the first active lane.
+
+In this version (=2), the ShuffleReduceFn behaves, per element, as follows:
+```
+//reduce_elem refers to an element in the local ReduceData
+//remote_elem is retrieved from a remote lane
+remote_elem = shuffle_down(reduce_elem, offset, 32);
+if (LaneId % 2 == 0 && Offset > 0) {
+    reduce_elem = reduce_elem @ remote_elem
+} else {
+    reduce_elem = remote_elem
+}
+```
+We now give a brief explanation of some of the arguments passed in. Note that
+this section introduces the concept of a logical_lane_id, which must be
+distinguished from the physical lane_id as defined by nvidia.
+1. //logical_lane_id//: as the name suggests, it refers to a calculated
+   lane_id (instead of the physical one defined by nvidia) that makes our
+   algorithm logically concise. A thread with logical_lane_id k means
+   there are (k-1) threads before it.
+2. //remote_id-1-threadIdx.x//: remote_id is the nvidia-defined lane id of
+   the remote lane from which we will retrieve the ReduceData. We subtract
+   (threadIdx.x+1) from it because we would like to maintain only one
+   underlying shuffle intrinsic (which is used to communicate among the lanes
+   in a warp). The particular shuffle intrinsic we use accepts only offsets,
+   not absolute lane_ids, so the subtraction converts the absolute lane_id we
+   calculated into an offset.
+
+This algorithm differs from the above algorithms in two ways and is not,
+conceptually, a generalization of them.
+1. It reduces elements that are close to each other. For instance, the value
+   in the 0th lane is combined with that of the 1st lane; the value in the
+   2nd lane is combined with that of the 3rd lane. We did not use the
+   previous scheme, where the first half of the (partial) warp is reduced
+   with the second half, because the mapping
+   f(x): logical_lane_id -> physical_lane_id
+   can be easily calculated whereas its inverse
+   f^-1(x): physical_lane_id -> logical_lane_id
+   cannot, and performing such a reduction requires the inverse to be known.
+2. Because this algorithm is agnostic about the positions of the active
+   lanes, we do not need to perform the copying step used in the second
+   algorithm.
+An illustrative run would look like:
+{F76}
+As observed, the overhead is high because in each and every reduction step
+the logical_lane_id is recalculated; so is the remote_id.
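+
+To make the warp-level algorithms above concrete, the following is a small,
+self-contained host-side C simulation of the contiguous (version 1) reduction
+loop for a 6-lane partial warp, assuming '+' as the reduction operator. The
+lane array and the helper names are illustrative stand-ins for the warp
+shuffle machinery; they are not part of the runtime.
+
+```
+#include <stdio.h>
+
+#define WARPSIZE 32
+
+/* Per-lane partial results; lanes [0, size) hold valid data. */
+static double lane_val[WARPSIZE];
+
+/* Version 1 ShuffleReduceFn behaviour for a single element. */
+static void shuffle_reduce_v1(int lane_id, int offset) {
+  double remote = lane_val[lane_id + offset]; /* stands in for shuffle_down */
+  if (lane_id < offset)
+    lane_val[lane_id] += remote; /* reduce */
+  else
+    lane_val[lane_id] = remote;  /* copy, to keep valid data contiguous */
+}
+
+int main(void) {
+  int size = 6; /* a partial warp with 6 contiguous active lanes */
+  for (int i = 0; i < size; ++i)
+    lane_val[i] = i + 1.0; /* 1 + 2 + ... + 6 = 21 */
+
+  /* The loop mirrors gpu_irregular_warp_reduce; in a real warp every
+     active lane makes the call in lockstep, while here we only model
+     the lanes that have a remote partner. */
+  int curr_size = size;
+  int offset = curr_size / 2;
+  while (offset > 0) {
+    for (int lane_id = 0; lane_id + offset < curr_size; ++lane_id)
+      shuffle_reduce_v1(lane_id, offset);
+    curr_size = (curr_size + 1) / 2;
+    offset = curr_size / 2;
+  }
+
+  printf("reduced value in lane 0: %f\n", lane_val[0]); /* expect 21 */
+  return 0;
+}
+```
+
+Running this prints 21, the sum of the six per-lane values, after three
+passes; the copy branch is taken exactly when a trailing odd lane must hand
+its value down, as described above.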
+ +On a block level, we have implemented the following block reduce algorithm: + +``` +gpu_irregular_block_reduce(void *reduce_data, + kmp_ShuffleReductFctPtr shuflReduceFn, + kmp_InterWarpCopyFctPtr interWarpCpyFn, + int size) { + + int wid = threadIdx.x/WARPSIZE; + int lane_id = threadIdx.x%WARPSIZE; + + int warp_needed = (size+WARPSIZE-1)/WARPSIZE; //ceiling of division + + unsigned tnum = __ballot(1); + int thread_num = __popc(tnum); + + //full warp reduction + if (thread_num == WARPSIZE) { + gpu_regular_warp_reduce(reduce_data, shuflReduceFn); + } + //partial warp reduction + if (thread_num < WARPSIZE) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, thread_num, + lane_id); + } + //Gather all the reduced values from each warp + //to the first warp + //named_barrier inside this function to ensure + //correctness. It is effectively a sync_thread + //that won't deadlock. + interWarpCpyFn(reduce_data, warp_needed); + + //This is to reduce data gathered from each "warp master". + if (wid==0) { + gpu_irregular_warp_reduce(reduce_data, shuflReduceFn, warp_needed, + lane_id); + } + + return; +} +``` +In this function, no ShuffleReduceFn is directly called as it makes calls +to various versions of the warp-reduction functions. It first reduces +ReduceData warp by warp; in the end, we end up with the number of +ReduceData equal to the number of warps present in this thread +block. We then proceed to gather all such ReduceData to the first warp. + +As observed, in this algorithm we make use of the function InterWarpCpyFn, +which copies data from each of the "warp master" (0th lane of each warp, where +a warp-reduced ReduceData is held) to the 0th warp. This step reduces (in a +mathematical sense) the problem of reduction across warp masters in a block to +the problem of warp reduction which we already have solutions to. + +We can thus completely avoid the use of atomics to reduce in a threadblock. + +**Efficient Cross Block Reduce** + +The next challenge is to reduce values across threadblocks. We aim to do this +without atomics or critical sections. + +Let a kernel be started with TB threadblocks. +Let the GPU have S SMs. +There can be at most N active threadblocks per SM at any time. + +Consider a threadblock tb (tb < TB) running on SM s (s < SM). 'tb' is one of +at most 'N' active threadblocks on SM s. Let each threadblock active on an SM +be given an instance identifier id (0 <= id < N). Therefore, the tuple (s, id) +uniquely identifies an active threadblock on the GPU. + +To efficiently implement cross block reduce, we first allocate an array for +each value to be reduced of size S*N (which is the maximum number of active +threadblocks at any time on the device). + +Each threadblock reduces its value to slot [s][id]. This can be done without +locking since no other threadblock can write to the same slot concurrently. + +As a final stage, we reduce the values in the array as follows: + +``` +// Compiler generated wrapper function for each target region with a reduction +clause. +target_function_wrapper(map_args, reduction_array) <--- start with 1 team and 1 + thread. + // Use dynamic parallelism to launch M teams, N threads as requested by the + user to execute the target region. + + target_function<>(map_args) + + Reduce values in reduction_array + +``` + +**Comparison with Last Version** + + +The (simplified) pseudo code generated by LLVM on the host is as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. 
Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + where: + struct ReduceData { + double *foo; + double *bar; + } reduceData + reduceData.foo = &foo_p + reduceData.bar = &bar_p + + reduceFn is a pointer to a function that takes in two inputs + of type ReduceData, "reduces" them element wise, and places the + result in the first input: + reduceFn(ReduceData *a, ReduceData *b) + a = a @ b + + Every thread in the parallel region calls kmpc_reduce_nowait with + its private copy of reduceData. The runtime reduces across the + threads (using tree reduction on the operator 'reduceFn?) and stores + the final result in the master thread if successful. + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +On a GPU, a similar reduction may need to be performed across SIMT threads, +warps, and threadblocks. The challenge is to do so efficiently in a fashion +that is compatible with the LLVM OpenMP implementation. + +In the previously released 0.1 version of the LLVM OpenMP compiler for GPUs, +the salient steps of the code generated are as follows: + + +``` + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_nowait(..., reduceData, reduceFn, lock) + status = can_block_reduce() + if status == 1: + reduce efficiently to thread 0 using shuffles and shared memory. + return 1 + else + cannot use efficient block reduction, fallback to atomics + return 2 + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 2: + In this case kmpc_reduce_nowait() could not use tree reduction, + so use atomics instead: + each thread atomically writes to foo + each thread atomically writes to bar +``` + +The function can_block_reduce() is defined as follows: + + +``` +int32_t can_block_reduce() { + int tid = GetThreadIdInTeam(); + int nt = GetNumberOfOmpThreads(tid); + if (nt != blockDim.x) + return 0; + unsigned tnum = __ballot(1); + if (tnum != (~0x0)) { + return 0; + } + return 1; +} +``` + +This function permits the use of the efficient block reduction algorithm +using shuffles and shared memory (return 1) only if (a) all SIMT threads in +a warp are active (i.e., number of threads in the parallel region is a +multiple of 32) and (b) the number of threads in the parallel region +(set by the num_threads clause) equals blockDim.x. + +If either of these preconditions is not true, each thread in the threadblock +updates the global value using atomics. + +Atomics and compare-and-swap operations are expensive on many threaded +architectures such as GPUs and we must avoid them completely. + + +**Appendix: Implementation Details** + + +``` +// Compiler generated function. +reduceFn(ReduceData *a, ReduceData *b) + a->foo = a->foo + b->foo + a->bar = a->bar + b->bar + +// Compiler generated function. 
+swapAndReduceFn(ReduceData *thread_private, int lane) + ReduceData *remote = new ReduceData() + remote->foo = shuffle_double(thread_private->foo, lane) + remote->bar = shuffle_double(thread_private->bar, lane) + reduceFn(thread_private, remote) + +// OMP runtime function. +warpReduce_regular(ReduceData *thread_private, Fn *swapAndReduceFn): + offset = 16 + while (offset > 0) + swapAndReduceFn(thread_private, offset) + offset /= 2 + +// OMP runtime function. +warpReduce_irregular(): + ... + +// OMP runtime function. +kmpc_reduce_warp(reduceData, swapAndReduceFn) + if all_lanes_active: + warpReduce_regular(reduceData, swapAndReduceFn) + else: + warpReduce_irregular(reduceData, swapAndReduceFn) + if in_simd_region: + // all done, reduce to global in simd lane 0 + return 1 + else if in_parallel_region: + // done reducing to one value per warp, now reduce across warps + return 3 + +// OMP runtime function; one for each basic type. +kmpc_reduce_block_double(double *a) + if lane == 0: + shared[wid] = *a + named_barrier(1, num_threads) + if wid == 0 + block_reduce(shared) + if lane == 0 + *a = shared[0] + named_barrier(1, num_threads) + if wid == 0 and lane == 0 + return 1 // write back reduced result + else + return 0 // don't do anything + +``` + + + +``` +// Compiler generated code. + 1. Create private copies of variables: foo_p, bar_p + 2. Each thread reduces the chunk of A and B assigned to it and writes + to foo_p and bar_p respectively. + 3. ret = kmpc_reduce_warp(reduceData, swapAndReduceFn) + 4. if ret == 1: + The master thread stores the reduced result in the globals. + foo += reduceData.foo; bar += reduceData.bar + 5. else if ret == 3: + ret = block_reduce_double(reduceData.foo) + if ret == 1: + foo += reduceData.foo + ret = block_reduce_double(reduceData.bar) + if ret == 1: + bar += reduceData.bar +``` + +**Notes** + + 1. This scheme requires that the CUDA OMP runtime can call llvm generated + functions. This functionality now works. + 2. If the user inlines the CUDA OMP runtime bitcode, all of the machinery + (including calls through function pointers) are optimized away. + 3. If we are reducing multiple to multiple variables in a parallel region, + the reduce operations are all performed in warpReduce_[ir]regular(). This + results in more instructions in the loop and should result in fewer + stalls due to data dependencies. Unfortunately we cannot do the same in + kmpc_reduce_block_double() without increasing shared memory usage. diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h index c5e91c5bf5270..fa232a6ed8d06 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/nvptx_interface.h @@ -1,18 +1,18 @@ -//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _NVPTX_INTERFACE_H_ -#define _NVPTX_INTERFACE_H_ - -#include - -#define EXTERN extern "C" __device__ -typedef uint32_t __kmpc_impl_lanemask_t; -typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ - -#endif +//===--- nvptx_interface.h - OpenMP interface definitions -------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _NVPTX_INTERFACE_H_ +#define _NVPTX_INTERFACE_H_ + +#include + +#define EXTERN extern "C" __device__ +typedef uint32_t __kmpc_impl_lanemask_t; +typedef uint32_t omp_lock_t; /* arbitrary type of the right length */ + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu index 50867bc4010af..320d7a56434fb 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.cu @@ -1,50 +1,50 @@ -//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// - -#include "target_impl.h" -#include "common/debug.h" -#include "common/target_atomic.h" - -#define __OMP_SPIN 1000 -#define UNSET 0u -#define SET 1u - -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { - __kmpc_impl_unset_lock(lock); -} - -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { - // TODO: not sure spinning is a good idea here.. - while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { - clock_t start = clock(); - clock_t now; - for (;;) { - now = clock(); - clock_t cycles = now > start ? now - start : now + (0xffffffff - start); - if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { - break; - } - } - } // wait for 0 to be the read value -} - -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { - (void)__kmpc_atomic_exchange(lock, UNSET); -} - -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { - return __kmpc_atomic_add(lock, 0u); -} +//===---------- target_impl.cu - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// + +#include "target_impl.h" +#include "common/debug.h" +#include "common/target_atomic.h" + +#define __OMP_SPIN 1000 +#define UNSET 0u +#define SET 1u + +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock) { + __kmpc_impl_unset_lock(lock); +} + +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock) { + // TODO: not sure spinning is a good idea here.. + while (__kmpc_atomic_cas(lock, UNSET, SET) != UNSET) { + clock_t start = clock(); + clock_t now; + for (;;) { + now = clock(); + clock_t cycles = now > start ? now - start : now + (0xffffffff - start); + if (cycles >= __OMP_SPIN * GetBlockIdInKernel()) { + break; + } + } + } // wait for 0 to be the read value +} + +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock) { + (void)__kmpc_atomic_exchange(lock, UNSET); +} + +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock) { + return __kmpc_atomic_add(lock, 0u); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h index 1b966510ec7ef..032943fe4e063 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h +++ b/openmp/libomptarget/deviceRTLs/nvptx/src/target_impl.h @@ -1,218 +1,218 @@ -//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Definitions of target specific functions -// -//===----------------------------------------------------------------------===// -#ifndef _TARGET_IMPL_H_ -#define _TARGET_IMPL_H_ - -#include -#include -#include -#include -#include - -#include "nvptx_interface.h" - -#define DEVICE __device__ -#define INLINE __forceinline__ DEVICE -#define NOINLINE __noinline__ DEVICE -#define SHARED __shared__ -#define ALIGN(N) __align__(N) - -//////////////////////////////////////////////////////////////////////////////// -// Kernel options -//////////////////////////////////////////////////////////////////////////////// - -//////////////////////////////////////////////////////////////////////////////// -// The following def must match the absolute limit hardwired in the host RTL -// max number of threads per team -#define MAX_THREADS_PER_TEAM 1024 - -#define WARPSIZE 32 - -// The named barrier for active parallel threads of a team in an L1 parallel -// region to synchronize with each other. -#define L1_BARRIER (1) - -// Maximum number of preallocated arguments to an outlined parallel/simd function. -// Anything more requires dynamic memory allocation. -#define MAX_SHARED_ARGS 20 - -// Maximum number of omp state objects per SM allocated statically in global -// memory. -#if __CUDA_ARCH__ >= 700 -#define OMP_STATE_COUNT 32 -#define MAX_SM 84 -#elif __CUDA_ARCH__ >= 600 -#define OMP_STATE_COUNT 32 -#define MAX_SM 56 -#else -#define OMP_STATE_COUNT 16 -#define MAX_SM 16 -#endif - -#define OMP_ACTIVE_PARALLEL_LEVEL 128 - -// Data sharing related quantities, need to match what is used in the compiler. 
-enum DATA_SHARING_SIZES { - // The maximum number of workers in a kernel. - DS_Max_Worker_Threads = 992, - // The size reserved for data in a shared memory slot. - DS_Slot_Size = 256, - // The slot size that should be reserved for a working warp. - DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, - // The maximum number of warps in use - DS_Max_Warp_Number = 32, - // The size of the preallocated shared memory buffer per team - DS_Shared_Memory_Size = 128, -}; - -INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { - asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); -} - -INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { - uint64_t val; - asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); - return val; -} - -static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = - UINT32_C(0xffffffff); - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); - return res; -} - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { - __kmpc_impl_lanemask_t res; - asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); - return res; -} - -INLINE uint32_t __kmpc_impl_smid() { - uint32_t id; - asm("mov.u32 %0, %%smid;" : "=r"(id)); - return id; -} - -INLINE double __kmpc_impl_get_wtick() { - // Timer precision is 1ns - return ((double)1E-9); -} - -INLINE double __kmpc_impl_get_wtime() { - unsigned long long nsecs; - asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); - return (double)nsecs * __kmpc_impl_get_wtick(); -} - -INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } - -INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } - -template INLINE T __kmpc_impl_min(T x, T y) { - return min(x, y); -} - -#ifndef CUDA_VERSION -#error CUDA_VERSION macro is undefined, something wrong with cuda. -#endif - -// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). - -INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { -#if CUDA_VERSION >= 9000 - return __activemask(); -#else - return __ballot(1); -#endif -} - -// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. - -INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, - int32_t SrcLane) { -#if CUDA_VERSION >= 9000 - return __shfl_sync(Mask, Var, SrcLane); -#else - return __shfl(Var, SrcLane); -#endif // CUDA_VERSION -} - -INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, - int32_t Var, uint32_t Delta, - int32_t Width) { -#if CUDA_VERSION >= 9000 - return __shfl_down_sync(Mask, Var, Delta, Width); -#else - return __shfl_down(Var, Delta, Width); -#endif // CUDA_VERSION -} - -INLINE void __kmpc_impl_syncthreads() { - // Use original __syncthreads if compiled by nvcc or clang >= 9.0. -#if !defined(__clang__) || __clang_major__ >= 9 - __syncthreads(); -#else - asm volatile("bar.sync %0;" : : "r"(0) : "memory"); -#endif // __clang__ -} - -INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { -#if CUDA_VERSION >= 9000 - __syncwarp(Mask); -#else - // In Cuda < 9.0 no need to sync threads in warps. 
-#endif // CUDA_VERSION -} - -INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { - asm volatile("bar.sync %0, %1;" - : - : "r"(barrier), "r"(num_threads) - : "memory"); -} - -INLINE void __kmpc_impl_threadfence(void) { __threadfence(); } -INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); } -INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); } - -// Calls to the NVPTX layer (assuming 1D layout) -INLINE int GetThreadIdInBlock() { return threadIdx.x; } -INLINE int GetBlockIdInKernel() { return blockIdx.x; } -INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } -INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } -INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } -INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } - -// Return true if this is the first active thread in the warp. -INLINE bool __kmpc_impl_is_first_active_thread() { - unsigned long long Mask = __kmpc_impl_activemask(); - unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); - unsigned long long Sh = Mask << ShNum; - // Truncate Sh to the 32 lower bits - return (unsigned)Sh == 0; -} - -// Locks -EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); -EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); -EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); - -// Memory -INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } -INLINE void __kmpc_impl_free(void *x) { free(x); } - -#endif +//===------------ target_impl.h - NVPTX OpenMP GPU options ------- CUDA -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Definitions of target specific functions +// +//===----------------------------------------------------------------------===// +#ifndef _TARGET_IMPL_H_ +#define _TARGET_IMPL_H_ + +#include +#include +#include +#include +#include + +#include "nvptx_interface.h" + +#define DEVICE __device__ +#define INLINE __forceinline__ DEVICE +#define NOINLINE __noinline__ DEVICE +#define SHARED __shared__ +#define ALIGN(N) __align__(N) + +//////////////////////////////////////////////////////////////////////////////// +// Kernel options +//////////////////////////////////////////////////////////////////////////////// + +//////////////////////////////////////////////////////////////////////////////// +// The following def must match the absolute limit hardwired in the host RTL +// max number of threads per team +#define MAX_THREADS_PER_TEAM 1024 + +#define WARPSIZE 32 + +// The named barrier for active parallel threads of a team in an L1 parallel +// region to synchronize with each other. +#define L1_BARRIER (1) + +// Maximum number of preallocated arguments to an outlined parallel/simd function. +// Anything more requires dynamic memory allocation. +#define MAX_SHARED_ARGS 20 + +// Maximum number of omp state objects per SM allocated statically in global +// memory. 
+#if __CUDA_ARCH__ >= 700 +#define OMP_STATE_COUNT 32 +#define MAX_SM 84 +#elif __CUDA_ARCH__ >= 600 +#define OMP_STATE_COUNT 32 +#define MAX_SM 56 +#else +#define OMP_STATE_COUNT 16 +#define MAX_SM 16 +#endif + +#define OMP_ACTIVE_PARALLEL_LEVEL 128 + +// Data sharing related quantities, need to match what is used in the compiler. +enum DATA_SHARING_SIZES { + // The maximum number of workers in a kernel. + DS_Max_Worker_Threads = 992, + // The size reserved for data in a shared memory slot. + DS_Slot_Size = 256, + // The slot size that should be reserved for a working warp. + DS_Worker_Warp_Slot_Size = WARPSIZE * DS_Slot_Size, + // The maximum number of warps in use + DS_Max_Warp_Number = 32, + // The size of the preallocated shared memory buffer per team + DS_Shared_Memory_Size = 128, +}; + +INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) { + asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val)); +} + +INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) { + uint64_t val; + asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi)); + return val; +} + +static const __kmpc_impl_lanemask_t __kmpc_impl_all_lanes = + UINT32_C(0xffffffff); + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_lt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_lt;" : "=r"(res)); + return res; +} + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_lanemask_gt() { + __kmpc_impl_lanemask_t res; + asm("mov.u32 %0, %%lanemask_gt;" : "=r"(res)); + return res; +} + +INLINE uint32_t __kmpc_impl_smid() { + uint32_t id; + asm("mov.u32 %0, %%smid;" : "=r"(id)); + return id; +} + +INLINE double __kmpc_impl_get_wtick() { + // Timer precision is 1ns + return ((double)1E-9); +} + +INLINE double __kmpc_impl_get_wtime() { + unsigned long long nsecs; + asm("mov.u64 %0, %%globaltimer;" : "=l"(nsecs)); + return (double)nsecs * __kmpc_impl_get_wtick(); +} + +INLINE uint32_t __kmpc_impl_ffs(uint32_t x) { return __ffs(x); } + +INLINE uint32_t __kmpc_impl_popc(uint32_t x) { return __popc(x); } + +template INLINE T __kmpc_impl_min(T x, T y) { + return min(x, y); +} + +#ifndef CUDA_VERSION +#error CUDA_VERSION macro is undefined, something wrong with cuda. +#endif + +// In Cuda 9.0, __ballot(1) from Cuda 8.0 is replaced with __activemask(). + +INLINE __kmpc_impl_lanemask_t __kmpc_impl_activemask() { +#if CUDA_VERSION >= 9000 + return __activemask(); +#else + return __ballot(1); +#endif +} + +// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'. + +INLINE int32_t __kmpc_impl_shfl_sync(__kmpc_impl_lanemask_t Mask, int32_t Var, + int32_t SrcLane) { +#if CUDA_VERSION >= 9000 + return __shfl_sync(Mask, Var, SrcLane); +#else + return __shfl(Var, SrcLane); +#endif // CUDA_VERSION +} + +INLINE int32_t __kmpc_impl_shfl_down_sync(__kmpc_impl_lanemask_t Mask, + int32_t Var, uint32_t Delta, + int32_t Width) { +#if CUDA_VERSION >= 9000 + return __shfl_down_sync(Mask, Var, Delta, Width); +#else + return __shfl_down(Var, Delta, Width); +#endif // CUDA_VERSION +} + +INLINE void __kmpc_impl_syncthreads() { + // Use original __syncthreads if compiled by nvcc or clang >= 9.0. +#if !defined(__clang__) || __clang_major__ >= 9 + __syncthreads(); +#else + asm volatile("bar.sync %0;" : : "r"(0) : "memory"); +#endif // __clang__ +} + +INLINE void __kmpc_impl_syncwarp(__kmpc_impl_lanemask_t Mask) { +#if CUDA_VERSION >= 9000 + __syncwarp(Mask); +#else + // In Cuda < 9.0 no need to sync threads in warps. 
+#endif // CUDA_VERSION +} + +INLINE void __kmpc_impl_named_sync(int barrier, uint32_t num_threads) { + asm volatile("bar.sync %0, %1;" + : + : "r"(barrier), "r"(num_threads) + : "memory"); +} + +INLINE void __kmpc_impl_threadfence(void) { __threadfence(); } +INLINE void __kmpc_impl_threadfence_block(void) { __threadfence_block(); } +INLINE void __kmpc_impl_threadfence_system(void) { __threadfence_system(); } + +// Calls to the NVPTX layer (assuming 1D layout) +INLINE int GetThreadIdInBlock() { return threadIdx.x; } +INLINE int GetBlockIdInKernel() { return blockIdx.x; } +INLINE int GetNumberOfBlocksInKernel() { return gridDim.x; } +INLINE int GetNumberOfThreadsInBlock() { return blockDim.x; } +INLINE unsigned GetWarpId() { return GetThreadIdInBlock() / WARPSIZE; } +INLINE unsigned GetLaneId() { return GetThreadIdInBlock() & (WARPSIZE - 1); } + +// Return true if this is the first active thread in the warp. +INLINE bool __kmpc_impl_is_first_active_thread() { + unsigned long long Mask = __kmpc_impl_activemask(); + unsigned long long ShNum = WARPSIZE - (GetThreadIdInBlock() % WARPSIZE); + unsigned long long Sh = Mask << ShNum; + // Truncate Sh to the 32 lower bits + return (unsigned)Sh == 0; +} + +// Locks +EXTERN void __kmpc_impl_init_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_destroy_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_set_lock(omp_lock_t *lock); +EXTERN void __kmpc_impl_unset_lock(omp_lock_t *lock); +EXTERN int __kmpc_impl_test_lock(omp_lock_t *lock); + +// Memory +INLINE void *__kmpc_impl_malloc(size_t x) { return malloc(x); } +INLINE void __kmpc_impl_free(void *x) { free(x); } + +#endif diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt index 1eabeb25ff98b..40cb35e6cc028 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/CMakeLists.txt @@ -1,25 +1,25 @@ -if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") - # Silently return, no need to annoy the user. - return() -endif() - -set(deps omptarget-nvptx omptarget omp) -if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) - set(deps ${deps} omptarget-nvptx-bc) -endif() - -# Run with only one thread to only launch one application to the GPU at a time. -add_openmp_testsuite(check-libomptarget-nvptx - "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} - EXCLUDE_FROM_CHECK_ALL - DEPENDS ${deps} ARGS -j1) - -set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING - "Extra compiler flags to send to the test compiler.") -set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS - "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING - "OpenMP compiler flags to use for testing libomptarget-nvptx.") - -# Configure the lit.site.cfg.in file -set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!") -configure_file(lit.site.cfg.in lit.site.cfg @ONLY) +if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang") + # Silently return, no need to annoy the user. + return() +endif() + +set(deps omptarget-nvptx omptarget omp) +if(LIBOMPTARGET_NVPTX_ENABLE_BCLIB) + set(deps ${deps} omptarget-nvptx-bc) +endif() + +# Run with only one thread to only launch one application to the GPU at a time. 
+add_openmp_testsuite(check-libomptarget-nvptx + "Running libomptarget-nvptx tests" ${CMAKE_CURRENT_BINARY_DIR} + EXCLUDE_FROM_CHECK_ALL + DEPENDS ${deps} ARGS -j1) + +set(LIBOMPTARGET_NVPTX_TEST_FLAGS "" CACHE STRING + "Extra compiler flags to send to the test compiler.") +set(LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS + "-fopenmp -fopenmp-targets=nvptx64-nvidia-cuda" CACHE STRING + "OpenMP compiler flags to use for testing libomptarget-nvptx.") + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget-nvptx configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c index 60254bc7ed2e2..58a16b8e82daf 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/get_max_threads.c @@ -1,22 +1,22 @@ -// RUN: %compile-run-and-check -#include -#include - -int main(){ - int max_threads = -1; - int num_threads = -1; - - #pragma omp target map(tofrom: max_threads) - max_threads = omp_get_max_threads(); - - #pragma omp target parallel map(tofrom: num_threads) - { - #pragma omp master - num_threads = omp_get_num_threads(); - } - - // CHECK: Max Threads: 128, Num Threads: 128 - printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); - - return 0; -} +// RUN: %compile-run-and-check +#include +#include + +int main(){ + int max_threads = -1; + int num_threads = -1; + + #pragma omp target map(tofrom: max_threads) + max_threads = omp_get_max_threads(); + + #pragma omp target parallel map(tofrom: num_threads) + { + #pragma omp master + num_threads = omp_get_num_threads(); + } + + // CHECK: Max Threads: 128, Num Threads: 128 + printf("Max Threads: %d, Num Threads: %d\n", max_threads, num_threads); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c index 1fa9ae024f6f5..657aad915bea6 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/ignored.c @@ -1,38 +1,38 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1; - - #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels) - { - // libomptarget-nvptx doesn't support cancellation. - cancellation = omp_get_cancellation(); - - // No support for dynamic adjustment of the number of threads. - omp_set_dynamic(1); - dynamic = omp_get_dynamic(); - - // libomptarget-nvptx doesn't support nested parallelism. - omp_set_nested(1); - nested = omp_get_nested(); - - omp_set_max_active_levels(42); - maxActiveLevels = omp_get_max_active_levels(); - } - - // CHECK: cancellation = 0 - printf("cancellation = %d\n", cancellation); - // CHECK: dynamic = 0 - printf("dynamic = %d\n", dynamic); - // CHECK: nested = 0 - printf("nested = %d\n", nested); - // CHECK: maxActiveLevels = 1 - printf("maxActiveLevels = %d\n", maxActiveLevels); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int cancellation = -1, dynamic = -1, nested = -1, maxActiveLevels = -1; + + #pragma omp target map(cancellation, dynamic, nested, maxActiveLevels) + { + // libomptarget-nvptx doesn't support cancellation. 
+ cancellation = omp_get_cancellation(); + + // No support for dynamic adjustment of the number of threads. + omp_set_dynamic(1); + dynamic = omp_get_dynamic(); + + // libomptarget-nvptx doesn't support nested parallelism. + omp_set_nested(1); + nested = omp_get_nested(); + + omp_set_max_active_levels(42); + maxActiveLevels = omp_get_max_active_levels(); + } + + // CHECK: cancellation = 0 + printf("cancellation = %d\n", cancellation); + // CHECK: dynamic = 0 + printf("dynamic = %d\n", dynamic); + // CHECK: nested = 0 + printf("nested = %d\n", nested); + // CHECK: maxActiveLevels = 1 + printf("maxActiveLevels = %d\n", maxActiveLevels); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c index efb418fef9a0b..d9fd0b86f0e35 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/max_threads.c @@ -1,53 +1,53 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int MaxThreadsL1 = -1, MaxThreadsL2 = -1; - -#pragma omp declare reduction(unique:int \ - : omp_out = (omp_in == 1 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. -#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \ - num_teams(1) - { - MaxThreadsL1 = omp_get_max_threads(); -#pragma omp parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - } - - //FIXME: This Non-SPMD kernel will have 32 active threads due to - // thread_limit. However, Non-SPMD MaxThreadsL1 is the total number of - // threads in block (64 in this case), which translates to worker - // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD - // kernels. According to the spec, omp_get_max_threads must return the - // max active threads possible between the two kernel types. - - // CHECK: Non-SPMD MaxThreadsL1 = 64 - printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); - // CHECK: Non-SPMD MaxThreadsL2 = 1 - printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode with full runtime - MaxThreadsL2 = -1; -#pragma omp target parallel reduction(unique : MaxThreadsL2) - { MaxThreadsL2 = omp_get_max_threads(); } - - // CHECK: SPMD with full runtime MaxThreadsL2 = 1 - printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - // SPMD mode without runtime - MaxThreadsL2 = -1; -#pragma omp target parallel for reduction(unique : MaxThreadsL2) - for (int I = 0; I < 2; ++I) { - MaxThreadsL2 = omp_get_max_threads(); - } - - // CHECK: SPMD without runtime MaxThreadsL2 = 1 - printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int MaxThreadsL1 = -1, MaxThreadsL2 = -1; + +#pragma omp declare reduction(unique:int \ + : omp_out = (omp_in == 1 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) + + // Non-SPMD mode. +#pragma omp target teams map(MaxThreadsL1, MaxThreadsL2) thread_limit(32) \ + num_teams(1) + { + MaxThreadsL1 = omp_get_max_threads(); +#pragma omp parallel reduction(unique : MaxThreadsL2) + { MaxThreadsL2 = omp_get_max_threads(); } + } + + //FIXME: This Non-SPMD kernel will have 32 active threads due to + // thread_limit. 
However, Non-SPMD MaxThreadsL1 is the total number of + // threads in block (64 in this case), which translates to worker + // threads + WARP_SIZE for Non-SPMD kernels and worker threads for SPMD + // kernels. According to the spec, omp_get_max_threads must return the + // max active threads possible between the two kernel types. + + // CHECK: Non-SPMD MaxThreadsL1 = 64 + printf("Non-SPMD MaxThreadsL1 = %d\n", MaxThreadsL1); + // CHECK: Non-SPMD MaxThreadsL2 = 1 + printf("Non-SPMD MaxThreadsL2 = %d\n", MaxThreadsL2); + + // SPMD mode with full runtime + MaxThreadsL2 = -1; +#pragma omp target parallel reduction(unique : MaxThreadsL2) + { MaxThreadsL2 = omp_get_max_threads(); } + + // CHECK: SPMD with full runtime MaxThreadsL2 = 1 + printf("SPMD with full runtime MaxThreadsL2 = %d\n", MaxThreadsL2); + + // SPMD mode without runtime + MaxThreadsL2 = -1; +#pragma omp target parallel for reduction(unique : MaxThreadsL2) + for (int I = 0; I < 2; ++I) { + MaxThreadsL2 = omp_get_max_threads(); + } + + // CHECK: SPMD without runtime MaxThreadsL2 = 1 + printf("SPMD without runtime MaxThreadsL2 = %d\n", MaxThreadsL2); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c index 626d620dc4f3a..33ed6d5735d62 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/api/thread_limit.c @@ -1,72 +1,72 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1; - -#pragma omp declare reduction(unique64:int \ - : omp_out = (omp_in == 64 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) -#pragma omp declare reduction(unique32:int \ - : omp_out = (omp_in == 32 ? omp_in : omp_out)) \ - initializer(omp_priv = -1) - - // Non-SPMD mode. 
-#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \ - thread_limit(64) num_teams(1) - { - ThreadLimitL0 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 \ - : ThreadLimitL1, ThreadLimitL2) num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique64 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - } - - // CHECK: Non-SPMD ThreadLimitL0 = 64 - printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0); - // CHECK: Non-SPMD ThreadLimitL1 = 64 - printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: Non-SPMD ThreadLimitL2 = 64 - printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode with full runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD with full runtime ThreadLimitL1 = 32 - printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD with full runtime ThreadLimitL2 = 32 - printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - // SPMD mode without runtime - ThreadLimitL1 = -1; - ThreadLimitL2 = -1; -#pragma omp target parallel for reduction(unique32 \ - : ThreadLimitL1, ThreadLimitL2) \ - num_threads(32) - for (int I = 0; I < 2; ++I) { - ThreadLimitL1 = omp_get_thread_limit(); -#pragma omp parallel reduction(unique32 : ThreadLimitL2) - { ThreadLimitL2 = omp_get_thread_limit(); } - } - - // CHECK: SPMD without runtime ThreadLimitL1 = 32 - printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1); - // CHECK: SPMD without runtime ThreadLimitL2 = 32 - printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int ThreadLimitL0 = -1, ThreadLimitL1 = -1, ThreadLimitL2 = -1; + +#pragma omp declare reduction(unique64:int \ + : omp_out = (omp_in == 64 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) +#pragma omp declare reduction(unique32:int \ + : omp_out = (omp_in == 32 ? omp_in : omp_out)) \ + initializer(omp_priv = -1) + + // Non-SPMD mode. 
+#pragma omp target teams map(ThreadLimitL0, ThreadLimitL1, ThreadLimitL2) \ + thread_limit(64) num_teams(1) + { + ThreadLimitL0 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique64 \ + : ThreadLimitL1, ThreadLimitL2) num_threads(32) + { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique64 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + } + + // CHECK: Non-SPMD ThreadLimitL0 = 64 + printf("Non-SPMD ThreadLimitL0 = %d\n", ThreadLimitL0); + // CHECK: Non-SPMD ThreadLimitL1 = 64 + printf("Non-SPMD ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: Non-SPMD ThreadLimitL2 = 64 + printf("Non-SPMD ThreadLimitL2 = %d\n", ThreadLimitL2); + + // SPMD mode with full runtime + ThreadLimitL1 = -1; + ThreadLimitL2 = -1; +#pragma omp target parallel reduction(unique32 \ + : ThreadLimitL1, ThreadLimitL2) \ + num_threads(32) + { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique32 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + + // CHECK: SPMD with full runtime ThreadLimitL1 = 32 + printf("SPMD with full runtime ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: SPMD with full runtime ThreadLimitL2 = 32 + printf("SPMD with full runtime ThreadLimitL2 = %d\n", ThreadLimitL2); + + // SPMD mode without runtime + ThreadLimitL1 = -1; + ThreadLimitL2 = -1; +#pragma omp target parallel for reduction(unique32 \ + : ThreadLimitL1, ThreadLimitL2) \ + num_threads(32) + for (int I = 0; I < 2; ++I) { + ThreadLimitL1 = omp_get_thread_limit(); +#pragma omp parallel reduction(unique32 : ThreadLimitL2) + { ThreadLimitL2 = omp_get_thread_limit(); } + } + + // CHECK: SPMD without runtime ThreadLimitL1 = 32 + printf("SPMD without runtime ThreadLimitL1 = %d\n", ThreadLimitL1); + // CHECK: SPMD without runtime ThreadLimitL2 = 32 + printf("SPMD without runtime ThreadLimitL2 = %d\n", ThreadLimitL2); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c index dd17ae7c6a76c..d675087ed4319 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/data_sharing/alignment.c @@ -1,55 +1,55 @@ -// RUN: %compile-run-and-check - -#include -#include - -#pragma omp declare target -static void putValueInParallel(int *ptr, int value) { - #pragma omp parallel - { - *ptr = value; - } -} - -static int getId() { - int id; - putValueInParallel(&id, omp_get_thread_num()); - return id; -} -#pragma omp end declare target - -const int MaxThreads = 1024; -const int Threads = 64; - -int main(int argc, char *argv[]) { - int master; - int check[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check[i] = 0; - } - - #pragma omp target map(master, check[:]) - { - master = getId(); - - #pragma omp parallel num_threads(Threads) - { - check[omp_get_thread_num()] = getId(); - } - } - - // CHECK: master = 0. 
- printf("master = %d.\n", master); - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < Threads) { - if (check[i] != i) { - printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]); - } - } else if (check[i] != 0) { - printf("invalid: check[%d] should be 0, is %d\n", i, check[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +#pragma omp declare target +static void putValueInParallel(int *ptr, int value) { + #pragma omp parallel + { + *ptr = value; + } +} + +static int getId() { + int id; + putValueInParallel(&id, omp_get_thread_num()); + return id; +} +#pragma omp end declare target + +const int MaxThreads = 1024; +const int Threads = 64; + +int main(int argc, char *argv[]) { + int master; + int check[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check[i] = 0; + } + + #pragma omp target map(master, check[:]) + { + master = getId(); + + #pragma omp parallel num_threads(Threads) + { + check[omp_get_thread_num()] = getId(); + } + } + + // CHECK: master = 0. + printf("master = %d.\n", master); + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < Threads) { + if (check[i] != i) { + printf("invalid: check[%d] should be %d, is %d\n", i, i, check[i]); + } + } else if (check[i] != 0) { + printf("invalid: check[%d] should be 0, is %d\n", i, check[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg index 0774c25af20c2..5d89ac74ac59b 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.cfg @@ -1,69 +1,69 @@ -# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: -# Configuration file for the 'lit' test runner. - -import os -import lit.formats - -# Tell pylint that we know config and lit_config exist somewhere. -if 'PYLINT_IMPORT' in os.environ: - config = object() - lit_config = object() - -def prepend_library_path(name, value, sep): - if name in config.environment: - config.environment[name] = value + sep + config.environment[name] - else: - config.environment[name] = value - -# name: The name of this test suite. -config.name = 'libomptarget-nvptx' - -# suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc'] - -# test_source_root: The root path where tests are located. -config.test_source_root = os.path.dirname(__file__) - -# test_exec_root: The root object directory where output is placed -config.test_exec_root = config.binary_dir - -# test format -config.test_format = lit.formats.ShTest() - -# compiler flags -config.test_flags = " -I " + config.omp_header_directory + \ - " -L " + config.library_dir + \ - " --libomptarget-nvptx-path=" + config.library_dir; - -if config.omp_host_rtl_directory: - config.test_flags = config.test_flags + \ - " -L " + config.omp_host_rtl_directory - -config.test_flags = config.test_flags + " " + config.test_extra_flags - -# Setup environment to find dynamic library at runtime. -prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") -prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") - -# Forbid fallback to host. 
-config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" - -# substitutions -config.substitutions.append(("%compilexx-run-and-check", - "%compilexx-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compile-run-and-check", - "%compile-and-run | " + config.libomptarget_filecheck + " %s")) -config.substitutions.append(("%compilexx-and-run", "%compilexx && %run")) -config.substitutions.append(("%compile-and-run", "%compile && %run")) - -config.substitutions.append(("%compilexx", - "%clangxx %openmp_flags %flags %s -o %t")) -config.substitutions.append(("%compile", - "%clang %openmp_flags %flags %s -o %t")) - -config.substitutions.append(("%clangxx", config.test_cxx_compiler)) -config.substitutions.append(("%clang", config.test_c_compiler)) -config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) -config.substitutions.append(("%flags", config.test_flags)) - -config.substitutions.append(("%run", "%t")) +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. +if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def prepend_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget-nvptx' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.binary_dir + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.omp_header_directory + \ + " -L " + config.library_dir + \ + " --libomptarget-nvptx-path=" + config.library_dir; + +if config.omp_host_rtl_directory: + config.test_flags = config.test_flags + \ + " -L " + config.omp_host_rtl_directory + +config.test_flags = config.test_flags + " " + config.test_extra_flags + +# Setup environment to find dynamic library at runtime. +prepend_library_path('LD_LIBRARY_PATH', config.library_dir, ":") +prepend_library_path('LD_LIBRARY_PATH', config.omp_host_rtl_directory, ":") + +# Forbid fallback to host. 
+config.environment["OMP_TARGET_OFFLOAD"] = "MANDATORY" + +# substitutions +config.substitutions.append(("%compilexx-run-and-check", + "%compilexx-and-run | " + config.libomptarget_filecheck + " %s")) +config.substitutions.append(("%compile-run-and-check", + "%compile-and-run | " + config.libomptarget_filecheck + " %s")) +config.substitutions.append(("%compilexx-and-run", "%compilexx && %run")) +config.substitutions.append(("%compile-and-run", "%compile && %run")) + +config.substitutions.append(("%compilexx", + "%clangxx %openmp_flags %flags %s -o %t")) +config.substitutions.append(("%compile", + "%clang %openmp_flags %flags %s -o %t")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%flags", config.test_flags)) + +config.substitutions.append(("%run", "%t")) diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in index d9c14cbc53262..709ef1ce844c6 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/lit.site.cfg.in @@ -1,14 +1,14 @@ -@AUTO_GEN_COMMENT@ - -config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" -config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" -config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@" -config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@" -config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" -config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" -config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" -config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" - -# Let the main config do the real work. -lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_openmp_flags = "@LIBOMPTARGET_NVPTX_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@LIBOMPTARGET_NVPTX_TEST_FLAGS@" +config.binary_dir = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" + +# Let the main config do the real work. 
+lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c index 7c707718e13bd..3a2149f858b99 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/barrier.c @@ -1,37 +1,37 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target teams num_teams(2) map(tofrom \ - : out) map(to \ - : data, flag) \ - thread_limit(1) -#pragma omp parallel num_threads(1) - { - if (omp_get_team_num() == 0) { - /* Write to the data buffer that will be read by thread in team 1 */ - data = 42; -/* Flush data to thread in team 1 */ -#pragma omp barrier - /* Set flag to release thread in team 1 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_team_num() == 1) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp barrier - } - } - // CHECK: out=42. - /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int data, out, flag = 0; +#pragma omp target teams num_teams(2) map(tofrom \ + : out) map(to \ + : data, flag) \ + thread_limit(1) +#pragma omp parallel num_threads(1) + { + if (omp_get_team_num() == 0) { + /* Write to the data buffer that will be read by thread in team 1 */ + data = 42; +/* Flush data to thread in team 1 */ +#pragma omp barrier + /* Set flag to release thread in team 1 */ +#pragma omp atomic write + flag = 1; + } else if (omp_get_team_num() == 1) { + /* Loop until we see the update to the flag */ + int val; + do { +#pragma omp atomic read + val = flag; + } while (val < 1); + out = data; +#pragma omp barrier + } + } + // CHECK: out=42. + /* Value of out will be 42 */ + printf("out=%d.\n", out); + return !(out == 42); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c index 412538b6dd156..b5fc059828f81 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/flush.c @@ -1,35 +1,35 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main(int argc, char *argv[]) { - int data, out, flag = 0; -#pragma omp target parallel num_threads(64) map(tofrom \ - : out, flag) map(to \ - : data) - { - if (omp_get_thread_num() == 0) { - /* Write to the data buffer that will be read by thread */ - data = 42; -/* Flush data to thread 32 */ -#pragma omp flush(data) - /* Set flag to release thread 32 */ -#pragma omp atomic write - flag = 1; - } else if (omp_get_thread_num() == 32) { - /* Loop until we see the update to the flag */ - int val; - do { -#pragma omp atomic read - val = flag; - } while (val < 1); - out = data; -#pragma omp flush(out) - } - } - // CHECK: out=42. 
- /* Value of out will be 42 */ - printf("out=%d.\n", out); - return !(out == 42); -} +// RUN: %compile-run-and-check + +#include +#include + +int main(int argc, char *argv[]) { + int data, out, flag = 0; +#pragma omp target parallel num_threads(64) map(tofrom \ + : out, flag) map(to \ + : data) + { + if (omp_get_thread_num() == 0) { + /* Write to the data buffer that will be read by thread */ + data = 42; +/* Flush data to thread 32 */ +#pragma omp flush(data) + /* Set flag to release thread 32 */ +#pragma omp atomic write + flag = 1; + } else if (omp_get_thread_num() == 32) { + /* Loop until we see the update to the flag */ + int val; + do { +#pragma omp atomic read + val = flag; + } while (val < 1); + out = data; +#pragma omp flush(out) + } + } + // CHECK: out=42. + /* Value of out will be 42 */ + printf("out=%d.\n", out); + return !(out == 42); +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c index 0a137530cef74..7b28c5f302082 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/level.c @@ -1,151 +1,151 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; - -int main(int argc, char *argv[]) { - int level = -1, activeLevel = -1; - // The expected value is -1, initialize to different value. - int ancestorTNumNeg = 1, teamSizeNeg = 1; - int ancestorTNum0 = -1, teamSize0 = -1; - // The expected value is -1, initialize to different value. - int ancestorTNum1 = 1, teamSize1 = 1; - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ - map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ - map(check1[:], check2[:], check3[:], check4[:]) - { - level = omp_get_level(); - activeLevel = omp_get_active_level(); - - // Expected to return -1. - ancestorTNumNeg = omp_get_ancestor_thread_num(-1); - teamSizeNeg = omp_get_team_size(-1); - - // Expected to return 0 and 1. - ancestorTNum0 = omp_get_ancestor_thread_num(0); - teamSize0 = omp_get_team_size(0); - - // Expected to return -1 because the requested level is larger than - // the nest level. - ancestorTNum1 = omp_get_ancestor_thread_num(1); - teamSize1 = omp_get_team_size(1); - - // Expecting active parallel region. - #pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - // Multiply return value of omp_get_level by 5 to avoid that this test - // passes if both API calls return wrong values. - check1[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. - check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the current thread num. - check2[id] += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the current number of threads. - check2[id] += 3 * omp_get_team_size(1); - // Expected to return -1, see above. - check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); - - // Expecting serialized parallel region. - #pragma omp parallel - { - #pragma omp atomic - check3[id] += omp_get_level() * 5 + omp_get_active_level(); - - // Expected to return 0 and 1. 
- int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); - // Expected to return the parent thread num. - check4Inc += (omp_get_ancestor_thread_num(1) - id); - // Expected to return the number of threads in the active parallel region. - check4Inc += 3 * omp_get_team_size(1); - // Expected to return 0 and 1. - check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); - // Expected to return -1, see above. - check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); - - #pragma omp atomic - check4[id] += check4Inc; - } - } - } - - // CHECK: target: level = 0, activeLevel = 0 - printf("target: level = %d, activeLevel = %d\n", level, activeLevel); - // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 - printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); - // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 - printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); - // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 - printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check active parallel region: - // omp_get_level() = 1, omp_get_active_level() = 1 - const int Expected1 = 6; - if (i < NumThreads) { - if (check1[i] != Expected1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // 5 * 1 + 3 * 64 - 1 - 1 (see above) - const int Expected2 = 195; - if (i < NumThreads) { - if (check2[i] != Expected2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - // Check serialized parallel region: - // omp_get_level() = 2, omp_get_active_level() = 1 - const int Expected3 = 11; - if (i < NumThreads) { - if (check3[i] != Expected3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) - const int Expected4 = 198; - if (i < NumThreads) { - if (check4[i] != Expected4) { - printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); - } - } else if (check4[i] != 0) { - printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - // Check for paraller level in non-SPMD kernels. - level = 0; - #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) - for (int i=0; i<5032; i+=32) { - int ub = (i+32 > 5032) ? 5032 : i+32; - #pragma omp parallel for schedule(dynamic) - for (int j=i ; j < ub; j++) ; - level += omp_get_level(); - } - // CHECK: Integral level = 0. - printf("Integral level = %d.\n", level); - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; +const int NumThreads = 64; + +int main(int argc, char *argv[]) { + int level = -1, activeLevel = -1; + // The expected value is -1, initialize to different value. + int ancestorTNumNeg = 1, teamSizeNeg = 1; + int ancestorTNum0 = -1, teamSize0 = -1; + // The expected value is -1, initialize to different value. 
+ int ancestorTNum1 = 1, teamSize1 = 1; + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + #pragma omp target map(level, activeLevel, ancestorTNumNeg, teamSizeNeg) \ + map(ancestorTNum0, teamSize0, ancestorTNum1, teamSize1) \ + map(check1[:], check2[:], check3[:], check4[:]) + { + level = omp_get_level(); + activeLevel = omp_get_active_level(); + + // Expected to return -1. + ancestorTNumNeg = omp_get_ancestor_thread_num(-1); + teamSizeNeg = omp_get_team_size(-1); + + // Expected to return 0 and 1. + ancestorTNum0 = omp_get_ancestor_thread_num(0); + teamSize0 = omp_get_team_size(0); + + // Expected to return -1 because the requested level is larger than + // the nest level. + ancestorTNum1 = omp_get_ancestor_thread_num(1); + teamSize1 = omp_get_team_size(1); + + // Expecting active parallel region. + #pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + // Multiply return value of omp_get_level by 5 to avoid that this test + // passes if both API calls return wrong values. + check1[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + check2[id] += omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the current thread num. + check2[id] += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the current number of threads. + check2[id] += 3 * omp_get_team_size(1); + // Expected to return -1, see above. + check2[id] += omp_get_ancestor_thread_num(2) + omp_get_team_size(2); + + // Expecting serialized parallel region. + #pragma omp parallel + { + #pragma omp atomic + check3[id] += omp_get_level() * 5 + omp_get_active_level(); + + // Expected to return 0 and 1. + int check4Inc = omp_get_ancestor_thread_num(0) + 5 * omp_get_team_size(0); + // Expected to return the parent thread num. + check4Inc += (omp_get_ancestor_thread_num(1) - id); + // Expected to return the number of threads in the active parallel region. + check4Inc += 3 * omp_get_team_size(1); + // Expected to return 0 and 1. + check4Inc += omp_get_ancestor_thread_num(2) + 3 * omp_get_team_size(2); + // Expected to return -1, see above. 
+ check4Inc += omp_get_ancestor_thread_num(3) + omp_get_team_size(3); + + #pragma omp atomic + check4[id] += check4Inc; + } + } + } + + // CHECK: target: level = 0, activeLevel = 0 + printf("target: level = %d, activeLevel = %d\n", level, activeLevel); + // CHECK: level = -1: ancestorTNum = -1, teamSize = -1 + printf("level = -1: ancestorTNum = %d, teamSize = %d\n", ancestorTNumNeg, teamSizeNeg); + // CHECK: level = 0: ancestorTNum = 0, teamSize = 1 + printf("level = 0: ancestorTNum = %d, teamSize = %d\n", ancestorTNum0, teamSize0); + // CHECK: level = 1: ancestorTNum = -1, teamSize = -1 + printf("level = 1: ancestorTNum = %d, teamSize = %d\n", ancestorTNum1, teamSize1); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check active parallel region: + // omp_get_level() = 1, omp_get_active_level() = 1 + const int Expected1 = 6; + if (i < NumThreads) { + if (check1[i] != Expected1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // 5 * 1 + 3 * 64 - 1 - 1 (see above) + const int Expected2 = 195; + if (i < NumThreads) { + if (check2[i] != Expected2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, Expected2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + // Check serialized parallel region: + // omp_get_level() = 2, omp_get_active_level() = 1 + const int Expected3 = 11; + if (i < NumThreads) { + if (check3[i] != Expected3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, Expected3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + // 5 * 1 + 3 * 64 + 3 * 1 - 1 - 1 (see above) + const int Expected4 = 198; + if (i < NumThreads) { + if (check4[i] != Expected4) { + printf("invalid: check4[%d] should be %d, is %d\n", i, Expected4, check4[i]); + } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + // Check for paraller level in non-SPMD kernels. + level = 0; + #pragma omp target teams distribute num_teams(1) thread_limit(32) reduction(+:level) + for (int i=0; i<5032; i+=32) { + int ub = (i+32 > 5032) ? 5032 : i+32; + #pragma omp parallel for schedule(dynamic) + for (int j=i ; j < ub; j++) ; + level += omp_get_level(); + } + // CHECK: Integral level = 0. + printf("Integral level = %d.\n", level); + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c index 70ebb1da9592e..747054c80fe62 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/nested.c @@ -1,136 +1,136 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int MaxThreads = 1024; -const int NumThreads = 64; -const int NumThreads1 = 1; - -int main(int argc, char *argv[]) { - int inParallel = -1, numThreads = -1, threadNum = -1; - int check1[MaxThreads]; - int check2[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. 
-#pragma omp parallel num_threads(NumThreads) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 1. - int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. - int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 64, omp_in_parallel() = 1. - int Expected = NumThreads + 1; - if (i < NumThreads) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. - if (i < NumThreads) { - if (check2[i] != 2) { - printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - inParallel = -1; - numThreads = -1; - threadNum = -1; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = 0; - } - -#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) - { - inParallel = omp_in_parallel(); - numThreads = omp_get_num_threads(); - threadNum = omp_get_thread_num(); - -// Expecting active parallel region. -#pragma omp parallel num_threads(NumThreads1) - { - int id = omp_get_thread_num(); - check1[id] += omp_get_num_threads() + omp_in_parallel(); - -// Expecting serialized parallel region. -#pragma omp parallel - { - // Expected to be 0. - int nestedInParallel = omp_in_parallel(); - // Expected to be 1. - int nestedNumThreads = omp_get_num_threads(); - // Expected to be 0. - int nestedThreadNum = omp_get_thread_num(); -#pragma omp atomic - check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; - } - } - } - - // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 - printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", - inParallel, numThreads, threadNum); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - // Check that all threads reported - // omp_get_num_threads() = 1, omp_in_parallel() = 0. - int Expected = 1; - if (i < NumThreads1) { - if (check1[i] != Expected) { - printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, - check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - // Check serialized parallel region. 
- if (i < NumThreads1) { - if (check2[i] != 1) { - printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int MaxThreads = 1024; +const int NumThreads = 64; +const int NumThreads1 = 1; + +int main(int argc, char *argv[]) { + int inParallel = -1, numThreads = -1, threadNum = -1; + int check1[MaxThreads]; + int check2[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + +// Expecting serialized parallel region. +#pragma omp parallel + { + // Expected to be 1. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); +#pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 64, omp_in_parallel() = 1. + int Expected = NumThreads + 1; + if (i < NumThreads) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads) { + if (check2[i] != 2) { + printf("invalid: check2[%d] should be 2, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + + inParallel = -1; + numThreads = -1; + threadNum = -1; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = 0; + } + +#pragma omp target map(inParallel, numThreads, threadNum, check1[:], check2[:]) + { + inParallel = omp_in_parallel(); + numThreads = omp_get_num_threads(); + threadNum = omp_get_thread_num(); + +// Expecting active parallel region. +#pragma omp parallel num_threads(NumThreads1) + { + int id = omp_get_thread_num(); + check1[id] += omp_get_num_threads() + omp_in_parallel(); + +// Expecting serialized parallel region. +#pragma omp parallel + { + // Expected to be 0. + int nestedInParallel = omp_in_parallel(); + // Expected to be 1. + int nestedNumThreads = omp_get_num_threads(); + // Expected to be 0. + int nestedThreadNum = omp_get_thread_num(); +#pragma omp atomic + check2[id] += nestedInParallel + nestedNumThreads + nestedThreadNum; + } + } + } + + // CHECK: target: inParallel = 0, numThreads = 1, threadNum = 0 + printf("target: inParallel = %d, numThreads = %d, threadNum = %d\n", + inParallel, numThreads, threadNum); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + // Check that all threads reported + // omp_get_num_threads() = 1, omp_in_parallel() = 0. 
+ int Expected = 1; + if (i < NumThreads1) { + if (check1[i] != Expected) { + printf("invalid: check1[%d] should be %d, is %d\n", i, Expected, + check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + // Check serialized parallel region. + if (i < NumThreads1) { + if (check2[i] != 1) { + printf("invalid: check2[%d] should be 1, is %d\n", i, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c index 4a2f73fee827a..ea16056b1ce3c 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/num_threads.c @@ -1,102 +1,102 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int NumThreads1 = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - int check4[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = check4[i] = 0; - } - - int maxThreads1 = -1; - int maxThreads2 = -1; - int maxThreads3 = -1; - - #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ - map(maxThreads1, maxThreads2, maxThreads3) - { - #pragma omp parallel num_threads(NumThreads1) - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - // API method to set number of threads in parallel regions without - // num_threads() clause. - omp_set_num_threads(NumThreads2); - maxThreads1 = omp_get_max_threads(); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads2 = omp_get_max_threads(); - - // num_threads() clause should override nthreads-var ICV. - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - - maxThreads3 = omp_get_max_threads(); - - // Effect from omp_set_num_threads() should still be visible. 
- #pragma omp parallel - { - check4[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: maxThreads1 = 64 - printf("maxThreads1 = %d\n", maxThreads1); - // CHECK: maxThreads2 = 64 - printf("maxThreads2 = %d\n", maxThreads2); - // CHECK: maxThreads3 = 64 - printf("maxThreads3 = %d\n", maxThreads3); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < NumThreads1) { - if (check1[i] != NumThreads1) { - printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < NumThreads2) { - if (check2[i] != NumThreads2) { - printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < NumThreads3) { - if (check3[i] != NumThreads3) { - printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - - if (i < NumThreads2) { - if (check4[i] != NumThreads2) { - printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); - } - } else if (check4[i] != 0) { - printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int WarpSize = 32; +const int NumThreads1 = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + int check4[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = check4[i] = 0; + } + + int maxThreads1 = -1; + int maxThreads2 = -1; + int maxThreads3 = -1; + + #pragma omp target map(check1[:], check2[:], check3[:], check4[:]) \ + map(maxThreads1, maxThreads2, maxThreads3) + { + #pragma omp parallel num_threads(NumThreads1) + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + // API method to set number of threads in parallel regions without + // num_threads() clause. + omp_set_num_threads(NumThreads2); + maxThreads1 = omp_get_max_threads(); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads2 = omp_get_max_threads(); + + // num_threads() clause should override nthreads-var ICV. + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + + maxThreads3 = omp_get_max_threads(); + + // Effect from omp_set_num_threads() should still be visible. 
+ #pragma omp parallel + { + check4[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: maxThreads1 = 64 + printf("maxThreads1 = %d\n", maxThreads1); + // CHECK: maxThreads2 = 64 + printf("maxThreads2 = %d\n", maxThreads2); + // CHECK: maxThreads3 = 64 + printf("maxThreads3 = %d\n", maxThreads3); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < NumThreads1) { + if (check1[i] != NumThreads1) { + printf("invalid: check1[%d] should be %d, is %d\n", i, NumThreads1, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < NumThreads2) { + if (check2[i] != NumThreads2) { + printf("invalid: check2[%d] should be %d, is %d\n", i, NumThreads2, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < NumThreads3) { + if (check3[i] != NumThreads3) { + printf("invalid: check3[%d] should be %d, is %d\n", i, NumThreads3, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + + if (i < NumThreads2) { + if (check4[i] != NumThreads2) { + printf("invalid: check4[%d] should be %d, is %d\n", i, NumThreads2, check4[i]); + } + } else if (check4[i] != 0) { + printf("invalid: check4[%d] should be 0, is %d\n", i, check4[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp index 517db59f64ae3..2339c4a589c14 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/spmd_parallel_regions.cpp @@ -1,51 +1,51 @@ -// RUN: %compilexx-run-and-check - -#include -#include - -int main(void) { - int isHost = -1; - int ParallelLevel1 = -1, ParallelLevel2 = -1; - int Count = 0; - -#pragma omp target parallel for map(tofrom \ - : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) - for (int J = 0; J < 10; ++J) { -#pragma omp critical - { - isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; - ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) - ? omp_get_level() - : ParallelLevel1; - } - if (omp_get_thread_num() > 5) { - int L2; -#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) - for (int I = 0; I < 10; ++I) { - L2 = omp_get_level(); - Count += omp_get_level(); // (10-6)*10*2 = 80 - } -#pragma omp critical - ParallelLevel2 = - (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; - } else { - Count += omp_get_level(); // 6 * 1 = 6 - } - } - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); - // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 - printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, - ParallelLevel2); - // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par - // level) + 6(num of iterations) * 1(par level) - // CHECK: Expected count = 86 - printf("Expected count = %d\n", Count); - - return isHost; -} +// RUN: %compilexx-run-and-check + +#include +#include + +int main(void) { + int isHost = -1; + int ParallelLevel1 = -1, ParallelLevel2 = -1; + int Count = 0; + +#pragma omp target parallel for map(tofrom \ + : isHost, ParallelLevel1, ParallelLevel2), reduction(+: Count) schedule(static, 1) + for (int J = 0; J < 10; ++J) { +#pragma omp critical + { + isHost = (isHost < 0 || isHost == 0) ? omp_is_initial_device() : isHost; + ParallelLevel1 = (ParallelLevel1 < 0 || ParallelLevel1 == 1) + ? omp_get_level() + : ParallelLevel1; + } + if (omp_get_thread_num() > 5) { + int L2; +#pragma omp parallel for schedule(dynamic) lastprivate(L2) reduction(+: Count) + for (int I = 0; I < 10; ++I) { + L2 = omp_get_level(); + Count += omp_get_level(); // (10-6)*10*2 = 80 + } +#pragma omp critical + ParallelLevel2 = + (ParallelLevel2 < 0 || ParallelLevel2 == 2) ? L2 : ParallelLevel2; + } else { + Count += omp_get_level(); // 6 * 1 = 6 + } + } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? "host" : "device"); + // CHECK: Parallel level in SPMD mode: L1 is 1, L2 is 2 + printf("Parallel level in SPMD mode: L1 is %d, L2 is %d\n", ParallelLevel1, + ParallelLevel2); + // Final result of Count is (10-6)(num of loops)*10(num of iterations)*2(par + // level) + 6(num of iterations) * 1(par level) + // CHECK: Expected count = 86 + printf("Expected count = %d\n", Count); + + return isHost; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c index 5e40bb564aa0f..858edd1cc8625 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/thread_limit.c @@ -1,77 +1,77 @@ -// RUN: %compile-run-and-check - -#include -#include - -const int WarpSize = 32; -const int ThreadLimit = 1 * WarpSize; -const int NumThreads2 = 2 * WarpSize; -const int NumThreads3 = 3 * WarpSize; -const int MaxThreads = 1024; - -int main(int argc, char *argv[]) { - int check1[MaxThreads]; - int check2[MaxThreads]; - int check3[MaxThreads]; - for (int i = 0; i < MaxThreads; i++) { - check1[i] = check2[i] = check3[i] = 0; - } - - int threadLimit = -1; - - #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ - map(check1[:], check2[:], check3[:], threadLimit) - { - threadLimit = omp_get_thread_limit(); - - // All parallel regions should get as many threads as specified by the - // thread_limit() clause. 
- #pragma omp parallel - { - check1[omp_get_thread_num()] += omp_get_num_threads(); - } - - omp_set_num_threads(NumThreads2); - #pragma omp parallel - { - check2[omp_get_thread_num()] += omp_get_num_threads(); - } - - #pragma omp parallel num_threads(NumThreads3) - { - check3[omp_get_thread_num()] += omp_get_num_threads(); - } - } - - // CHECK: threadLimit = 32 - printf("threadLimit = %d\n", threadLimit); - - // CHECK-NOT: invalid - for (int i = 0; i < MaxThreads; i++) { - if (i < ThreadLimit) { - if (check1[i] != ThreadLimit) { - printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); - } - } else if (check1[i] != 0) { - printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); - } - - if (i < ThreadLimit) { - if (check2[i] != ThreadLimit) { - printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); - } - } else if (check2[i] != 0) { - printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); - } - - if (i < ThreadLimit) { - if (check3[i] != ThreadLimit) { - printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); - } - } else if (check3[i] != 0) { - printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); - } - } - - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +const int WarpSize = 32; +const int ThreadLimit = 1 * WarpSize; +const int NumThreads2 = 2 * WarpSize; +const int NumThreads3 = 3 * WarpSize; +const int MaxThreads = 1024; + +int main(int argc, char *argv[]) { + int check1[MaxThreads]; + int check2[MaxThreads]; + int check3[MaxThreads]; + for (int i = 0; i < MaxThreads; i++) { + check1[i] = check2[i] = check3[i] = 0; + } + + int threadLimit = -1; + + #pragma omp target teams num_teams(1) thread_limit(ThreadLimit) \ + map(check1[:], check2[:], check3[:], threadLimit) + { + threadLimit = omp_get_thread_limit(); + + // All parallel regions should get as many threads as specified by the + // thread_limit() clause. 
+ #pragma omp parallel + { + check1[omp_get_thread_num()] += omp_get_num_threads(); + } + + omp_set_num_threads(NumThreads2); + #pragma omp parallel + { + check2[omp_get_thread_num()] += omp_get_num_threads(); + } + + #pragma omp parallel num_threads(NumThreads3) + { + check3[omp_get_thread_num()] += omp_get_num_threads(); + } + } + + // CHECK: threadLimit = 32 + printf("threadLimit = %d\n", threadLimit); + + // CHECK-NOT: invalid + for (int i = 0; i < MaxThreads; i++) { + if (i < ThreadLimit) { + if (check1[i] != ThreadLimit) { + printf("invalid: check1[%d] should be %d, is %d\n", i, ThreadLimit, check1[i]); + } + } else if (check1[i] != 0) { + printf("invalid: check1[%d] should be 0, is %d\n", i, check1[i]); + } + + if (i < ThreadLimit) { + if (check2[i] != ThreadLimit) { + printf("invalid: check2[%d] should be %d, is %d\n", i, ThreadLimit, check2[i]); + } + } else if (check2[i] != 0) { + printf("invalid: check2[%d] should be 0, is %d\n", i, check2[i]); + } + + if (i < ThreadLimit) { + if (check3[i] != ThreadLimit) { + printf("invalid: check3[%d] should be %d, is %d\n", i, ThreadLimit, check3[i]); + } + } else if (check3[i] != 0) { + printf("invalid: check3[%d] should be 0, is %d\n", i, check3[i]); + } + } + + return 0; +} diff --git a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c index b3f8768564080..ef0958070c857 100644 --- a/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c +++ b/openmp/libomptarget/deviceRTLs/nvptx/test/parallel/tripcount.c @@ -1,22 +1,22 @@ -// RUN: %compile-run-and-check - -#include -#include - -int main() { - int res = 0; - -#pragma omp parallel num_threads(2) reduction(+:res) - { - int tid = omp_get_thread_num(); -#pragma omp target teams distribute reduction(+:res) - for (int i = tid; i < 2; i++) - ++res; - } - // The first thread makes 2 iterations, the second - 1. Expected result of the - // reduction res is 3. - - // CHECK: res = 3. - printf("res = %d.\n", res); - return 0; -} +// RUN: %compile-run-and-check + +#include +#include + +int main() { + int res = 0; + +#pragma omp parallel num_threads(2) reduction(+:res) + { + int tid = omp_get_thread_num(); +#pragma omp target teams distribute reduction(+:res) + for (int i = tid; i < 2; i++) + ++res; + } + // The first thread makes 2 iterations, the second - 1. Expected result of the + // reduction res is 3. + + // CHECK: res = 3. + printf("res = %d.\n", res); + return 0; +} diff --git a/openmp/libomptarget/include/omptarget.h b/openmp/libomptarget/include/omptarget.h index de3afc36c7f28..6cdd73c4c9fb5 100644 --- a/openmp/libomptarget/include/omptarget.h +++ b/openmp/libomptarget/include/omptarget.h @@ -1,261 +1,261 @@ -//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Interface to be used by Clang during the codegen of a -// target region. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_H_ -#define _OMPTARGET_H_ - -#include -#include - -#define OFFLOAD_SUCCESS (0) -#define OFFLOAD_FAIL (~0) - -#define OFFLOAD_DEVICE_DEFAULT -1 -#define HOST_DEVICE -10 - -/// Data attributes for each data reference used in an OpenMP target region. 
-enum tgt_map_type { - // No flags - OMP_TGT_MAPTYPE_NONE = 0x000, - // copy data from host to device - OMP_TGT_MAPTYPE_TO = 0x001, - // copy data from device to host - OMP_TGT_MAPTYPE_FROM = 0x002, - // copy regardless of the reference count - OMP_TGT_MAPTYPE_ALWAYS = 0x004, - // force unmapping of data - OMP_TGT_MAPTYPE_DELETE = 0x008, - // map the pointer as well as the pointee - OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, - // pass device base address to kernel - OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, - // return base device address of mapped data - OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, - // private variable - not mapped - OMP_TGT_MAPTYPE_PRIVATE = 0x080, - // copy by value - not mapped - OMP_TGT_MAPTYPE_LITERAL = 0x100, - // mapping is implicit - OMP_TGT_MAPTYPE_IMPLICIT = 0x200, - // copy data to device - OMP_TGT_MAPTYPE_CLOSE = 0x400, - // member of struct, member given by [16 MSBs] - 1 - OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 -}; - -enum OpenMPOffloadingDeclareTargetFlags { - /// Mark the entry as having a 'link' attribute. - OMP_DECLARE_TARGET_LINK = 0x01, - /// Mark the entry as being a global constructor. - OMP_DECLARE_TARGET_CTOR = 0x02, - /// Mark the entry as being a global destructor. - OMP_DECLARE_TARGET_DTOR = 0x04 -}; - -enum OpenMPOffloadingRequiresDirFlags { - /// flag undefined. - OMP_REQ_UNDEFINED = 0x000, - /// no requires directive present. - OMP_REQ_NONE = 0x001, - /// reverse_offload clause. - OMP_REQ_REVERSE_OFFLOAD = 0x002, - /// unified_address clause. - OMP_REQ_UNIFIED_ADDRESS = 0x004, - /// unified_shared_memory clause. - OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, - /// dynamic_allocators clause. - OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 -}; - -/// This struct is a record of an entry point or global. For a function -/// entry point the size is expected to be zero -struct __tgt_offload_entry { - void *addr; // Pointer to the offload entry info (function or global) - char *name; // Name of the function or global - size_t size; // Size of the entry info (0 if it is a function) - int32_t flags; // Flags associated with the entry, e.g. 'link'. - int32_t reserved; // Reserved, to be used by the runtime library. -}; - -/// This struct is a record of the device image information -struct __tgt_device_image { - void *ImageStart; // Pointer to the target code start - void *ImageEnd; // Pointer to the target code end - __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries - __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) -}; - -/// This struct is a record of all the host code that may be offloaded to a -/// target. -struct __tgt_bin_desc { - int32_t NumDeviceImages; // Number of device types supported - __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) - __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries - __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) -}; - -/// This struct contains the offload entries identified by the target runtime -struct __tgt_target_table { - __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries - __tgt_offload_entry - *EntriesEnd; // End of the table with all the entries (non inclusive) -}; - -/// This struct contains information exchanged between different asynchronous -/// operations for device-dependent optimization and potential synchronization -struct __tgt_async_info { - // A pointer to a queue-like structure where offloading operations are issued. - // We assume to use this structure to do synchronization. 
In CUDA backend, it - // is CUstream. - void *Queue = nullptr; -}; - -#ifdef __cplusplus -extern "C" { -#endif - -int omp_get_num_devices(void); -int omp_get_initial_device(void); -void *omp_target_alloc(size_t size, int device_num); -void omp_target_free(void *device_ptr, int device_num); -int omp_target_is_present(void *ptr, int device_num); -int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, - size_t src_offset, int dst_device, int src_device); -int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, - int num_dims, const size_t *volume, const size_t *dst_offsets, - const size_t *src_offsets, const size_t *dst_dimensions, - const size_t *src_dimensions, int dst_device, int src_device); -int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, - size_t device_offset, int device_num); -int omp_target_disassociate_ptr(void *host_ptr, int device_num); - -/// add the clauses of the requires directives in a given file -void __tgt_register_requires(int64_t flags); - -/// adds a target shared library to the target execution image -void __tgt_register_lib(__tgt_bin_desc *desc); - -/// removes a target shared library from the target execution image -void __tgt_unregister_lib(__tgt_bin_desc *desc); - -// creates the host to target data mapping, stores it in the -// libomptarget.so internal structure (an entry in a stack of data maps) and -// passes the data to the device; -void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); - -// passes data from the target, release target memory and destroys the -// host-target mapping (top entry from the stack of data maps) created by -// the last __tgt_target_data_begin -void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types); -void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); - -/// passes data to/from the target -void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, - int32_t noAliasDepNum, - void *noAliasDepList); - -// Performs the same actions as data_begin in case arg_num is non-zero -// and initiates run of offloaded region on target platform; if arg_num -// is non-zero after the region execution is done it also performs the -// same action as data_end above. The following types are used; this -// function returns 0 if it was able to transfer the execution to a -// target and an int different from zero otherwise. 
-int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types); -int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); - -int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t num_teams, - int32_t thread_limit); -int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, - int32_t num_teams, int32_t thread_limit, - int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList); -void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); - -#ifdef __cplusplus -} -#endif - -#ifdef OMPTARGET_DEBUG -#include -#define DEBUGP(prefix, ...) \ - { \ - fprintf(stderr, "%s --> ", prefix); \ - fprintf(stderr, __VA_ARGS__); \ - } - -#ifndef __STDC_FORMAT_MACROS -#define __STDC_FORMAT_MACROS -#endif - -#include -#define DPxMOD "0x%0*" PRIxPTR -#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) - -/* - * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, - * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); - * - * DPxMOD expands to: - * "0x%0*" PRIxPTR - * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a - * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: - * "0x%0*lu" - * - * Ultimately, the whole statement expands to: - * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument - * // specifying the width of the output - * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width - * // 8 digits for 32bit systems - * // 16 digits for 64bit - * (uintptr_t) ptr); - */ -#else -#define DEBUGP(prefix, ...) \ - {} -#endif - -#ifdef __cplusplus -#define EXTERN extern "C" -#else -#define EXTERN extern -#endif - -#endif // _OMPTARGET_H_ +//===-------- omptarget.h - Target independent OpenMP target RTL -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_H_ +#define _OMPTARGET_H_ + +#include +#include + +#define OFFLOAD_SUCCESS (0) +#define OFFLOAD_FAIL (~0) + +#define OFFLOAD_DEVICE_DEFAULT -1 +#define HOST_DEVICE -10 + +/// Data attributes for each data reference used in an OpenMP target region. 
+enum tgt_map_type { + // No flags + OMP_TGT_MAPTYPE_NONE = 0x000, + // copy data from host to device + OMP_TGT_MAPTYPE_TO = 0x001, + // copy data from device to host + OMP_TGT_MAPTYPE_FROM = 0x002, + // copy regardless of the reference count + OMP_TGT_MAPTYPE_ALWAYS = 0x004, + // force unmapping of data + OMP_TGT_MAPTYPE_DELETE = 0x008, + // map the pointer as well as the pointee + OMP_TGT_MAPTYPE_PTR_AND_OBJ = 0x010, + // pass device base address to kernel + OMP_TGT_MAPTYPE_TARGET_PARAM = 0x020, + // return base device address of mapped data + OMP_TGT_MAPTYPE_RETURN_PARAM = 0x040, + // private variable - not mapped + OMP_TGT_MAPTYPE_PRIVATE = 0x080, + // copy by value - not mapped + OMP_TGT_MAPTYPE_LITERAL = 0x100, + // mapping is implicit + OMP_TGT_MAPTYPE_IMPLICIT = 0x200, + // copy data to device + OMP_TGT_MAPTYPE_CLOSE = 0x400, + // member of struct, member given by [16 MSBs] - 1 + OMP_TGT_MAPTYPE_MEMBER_OF = 0xffff000000000000 +}; + +enum OpenMPOffloadingDeclareTargetFlags { + /// Mark the entry as having a 'link' attribute. + OMP_DECLARE_TARGET_LINK = 0x01, + /// Mark the entry as being a global constructor. + OMP_DECLARE_TARGET_CTOR = 0x02, + /// Mark the entry as being a global destructor. + OMP_DECLARE_TARGET_DTOR = 0x04 +}; + +enum OpenMPOffloadingRequiresDirFlags { + /// flag undefined. + OMP_REQ_UNDEFINED = 0x000, + /// no requires directive present. + OMP_REQ_NONE = 0x001, + /// reverse_offload clause. + OMP_REQ_REVERSE_OFFLOAD = 0x002, + /// unified_address clause. + OMP_REQ_UNIFIED_ADDRESS = 0x004, + /// unified_shared_memory clause. + OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, + /// dynamic_allocators clause. + OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 +}; + +/// This struct is a record of an entry point or global. For a function +/// entry point the size is expected to be zero +struct __tgt_offload_entry { + void *addr; // Pointer to the offload entry info (function or global) + char *name; // Name of the function or global + size_t size; // Size of the entry info (0 if it is a function) + int32_t flags; // Flags associated with the entry, e.g. 'link'. + int32_t reserved; // Reserved, to be used by the runtime library. +}; + +/// This struct is a record of the device image information +struct __tgt_device_image { + void *ImageStart; // Pointer to the target code start + void *ImageEnd; // Pointer to the target code end + __tgt_offload_entry *EntriesBegin; // Begin of table with all target entries + __tgt_offload_entry *EntriesEnd; // End of table (non inclusive) +}; + +/// This struct is a record of all the host code that may be offloaded to a +/// target. +struct __tgt_bin_desc { + int32_t NumDeviceImages; // Number of device types supported + __tgt_device_image *DeviceImages; // Array of device images (1 per dev. type) + __tgt_offload_entry *HostEntriesBegin; // Begin of table with all host entries + __tgt_offload_entry *HostEntriesEnd; // End of table (non inclusive) +}; + +/// This struct contains the offload entries identified by the target runtime +struct __tgt_target_table { + __tgt_offload_entry *EntriesBegin; // Begin of the table with all the entries + __tgt_offload_entry + *EntriesEnd; // End of the table with all the entries (non inclusive) +}; + +/// This struct contains information exchanged between different asynchronous +/// operations for device-dependent optimization and potential synchronization +struct __tgt_async_info { + // A pointer to a queue-like structure where offloading operations are issued. + // We assume to use this structure to do synchronization. 
In CUDA backend, it + // is CUstream. + void *Queue = nullptr; +}; + +#ifdef __cplusplus +extern "C" { +#endif + +int omp_get_num_devices(void); +int omp_get_initial_device(void); +void *omp_target_alloc(size_t size, int device_num); +void omp_target_free(void *device_ptr, int device_num); +int omp_target_is_present(void *ptr, int device_num); +int omp_target_memcpy(void *dst, void *src, size_t length, size_t dst_offset, + size_t src_offset, int dst_device, int src_device); +int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device); +int omp_target_associate_ptr(void *host_ptr, void *device_ptr, size_t size, + size_t device_offset, int device_num); +int omp_target_disassociate_ptr(void *host_ptr, int device_num); + +/// add the clauses of the requires directives in a given file +void __tgt_register_requires(int64_t flags); + +/// adds a target shared library to the target execution image +void __tgt_register_lib(__tgt_bin_desc *desc); + +/// removes a target shared library from the target execution image +void __tgt_unregister_lib(__tgt_bin_desc *desc); + +// creates the host to target data mapping, stores it in the +// libomptarget.so internal structure (an entry in a stack of data maps) and +// passes the data to the device; +void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// passes data from the target, release target memory and destroys the +// host-target mapping (top entry from the stack of data maps) created by +// the last __tgt_target_data_begin +void __tgt_target_data_end(int64_t device_id, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types); +void __tgt_target_data_end_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +/// passes data to/from the target +void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +void __tgt_target_data_update_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, + int32_t noAliasDepNum, + void *noAliasDepList); + +// Performs the same actions as data_begin in case arg_num is non-zero +// and initiates run of offloaded region on target platform; if arg_num +// is non-zero after the region execution is done it also performs the +// same action as data_end above. The following types are used; this +// function returns 0 if it was able to transfer the execution to a +// target and an int different from zero otherwise. 
+int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types); +int __tgt_target_nowait(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); + +int __tgt_target_teams(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t num_teams, + int32_t thread_limit); +int __tgt_target_teams_nowait(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, + int32_t num_teams, int32_t thread_limit, + int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList); +void __kmpc_push_target_tripcount(int64_t device_id, uint64_t loop_tripcount); + +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +#include +#define DEBUGP(prefix, ...) \ + { \ + fprintf(stderr, "%s --> ", prefix); \ + fprintf(stderr, __VA_ARGS__); \ + } + +#ifndef __STDC_FORMAT_MACROS +#define __STDC_FORMAT_MACROS +#endif + +#include +#define DPxMOD "0x%0*" PRIxPTR +#define DPxPTR(ptr) ((int)(2*sizeof(uintptr_t))), ((uintptr_t) (ptr)) + +/* + * To printf a pointer in hex with a fixed width of 16 digits and a leading 0x, + * use printf("ptr=" DPxMOD "...\n", DPxPTR(ptr)); + * + * DPxMOD expands to: + * "0x%0*" PRIxPTR + * where PRIxPTR expands to an appropriate modifier for the type uintptr_t on a + * specific platform, e.g. "lu" if uintptr_t is typedef'd as unsigned long: + * "0x%0*lu" + * + * Ultimately, the whole statement expands to: + * printf("ptr=0x%0*lu...\n", // the 0* modifier expects an extra argument + * // specifying the width of the output + * (int)(2*sizeof(uintptr_t)), // the extra argument specifying the width + * // 8 digits for 32bit systems + * // 16 digits for 64bit + * (uintptr_t) ptr); + */ +#else +#define DEBUGP(prefix, ...) \ + {} +#endif + +#ifdef __cplusplus +#define EXTERN extern "C" +#else +#define EXTERN extern +#endif + +#endif // _OMPTARGET_H_ diff --git a/openmp/libomptarget/include/omptargetplugin.h b/openmp/libomptarget/include/omptargetplugin.h index 083e422aac163..366ad0161c99e 100644 --- a/openmp/libomptarget/include/omptargetplugin.h +++ b/openmp/libomptarget/include/omptargetplugin.h @@ -1,123 +1,133 @@ -//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines an interface between target independent OpenMP offload -// runtime library libomptarget and target dependent plugin. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGETPLUGIN_H_ -#define _OMPTARGETPLUGIN_H_ - -#include - -#ifdef __cplusplus -extern "C" { -#endif - -// Return the number of available devices of the type supported by the -// target RTL. -int32_t __tgt_rtl_number_of_devices(void); - -// Return an integer different from zero if the provided device image can be -// supported by the runtime. The functionality is similar to comparing the -// result of __tgt__rtl__load__binary to NULL. 
However, this is meant to be a -// lightweight query to determine if the RTL is suitable for an image without -// having to load the library, which can be expensive. -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); - -// Initialize the requires flags for the device. -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); - -// Initialize the specified device. In case of success return 0; otherwise -// return an error code. -int32_t __tgt_rtl_init_device(int32_t ID); - -// Pass an executable image section described by image to the specified -// device and prepare an address table of target entities. In case of error, -// return NULL. Otherwise, return a pointer to the built address table. -// Individual entries in the table may also be NULL, when the corresponding -// offload region is not supported on the target device. -__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, - __tgt_device_image *Image); - -// Allocate data on the particular target device, of the specified size. -// HostPtr is a address of the host data the allocated target data -// will be associated with (HostPtr may be NULL if it is not known at -// allocation time, like for example it would be for target data that -// is allocated by omp_target_alloc() API). Return address of the -// allocated data on the target that will be used by libomptarget.so to -// initialize the target data mapping structures. These addresses are -// used to generate a table of target variables to pass to -// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in -// case an error occurred on the target device. -void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); - -// Pass the data content to the target device using the target address. In case -// of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size); - -int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, - int64_t Size, - __tgt_async_info *AsyncInfoPtr); - -// Retrieve the data content from the target device using its address. In case -// of success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, - int64_t Size); - -// Asynchronous version of __tgt_rtl_data_retrieve -int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, - void *TargetPtr, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - -// De-allocate the data referenced by target ptr on the device. In case of -// success, return zero. Otherwise, return an error code. -int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); - -// Transfer control to the offloaded entry Entry on the target device. -// Args and Offsets are arrays of NumArgs size of target addresses and -// offsets. An offset should be added to the target address before passing it -// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is -// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be -// ignored on some platforms, like x86_64. In that case, it is synchronous. In -// case of success, return zero. Otherwise, return an error code. 
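Taken together, the declarations above define the contract every plugin implements. The following is a rough sketch, not the actual libomptarget implementation, of the order in which the target-independent runtime might drive one plugin for a single synchronous map, run, and copy-back; bookkeeping such as mapping tables and reference counting is omitted, and the include path for omptargetplugin.h is assumed.

// Sketch of one synchronous offload through the plugin interface; error
// handling is reduced to early returns and not all resources are reclaimed.
#include <cstddef>
#include <cstdint>
#include "omptargetplugin.h"   // assumed to be on the include path

int offload_once(int32_t Dev, __tgt_device_image *Image,
                 void *HostBuf, int64_t Size) {
  if (!__tgt_rtl_is_valid_binary(Image))      // cheap check before loading
    return 1;
  if (__tgt_rtl_init_device(Dev) != 0)
    return 1;

  __tgt_target_table *Table = __tgt_rtl_load_binary(Dev, Image);
  if (!Table || Table->EntriesBegin == Table->EntriesEnd)
    return 1;

  void *TgtBuf = __tgt_rtl_data_alloc(Dev, Size, HostBuf);
  if (!TgtBuf)
    return 1;
  if (__tgt_rtl_data_submit(Dev, TgtBuf, HostBuf, Size) != 0)
    return 1;

  // Launch the first entry in the table with a single argument, no offset.
  void *Args[] = {TgtBuf};
  ptrdiff_t Offsets[] = {0};
  int RC = __tgt_rtl_run_target_region(Dev, Table->EntriesBegin->addr,
                                       Args, Offsets, /*NumArgs=*/1);
  if (RC == 0)
    RC = __tgt_rtl_data_retrieve(Dev, HostBuf, TgtBuf, Size);

  __tgt_rtl_data_delete(Dev, TgtBuf);
  return RC;
}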
-int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs); - -// Asynchronous version of __tgt_rtl_run_target_region -int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs, - __tgt_async_info *AsyncInfoPtr); - -// Similar to __tgt_rtl_run_target_region, but additionally specify the -// number of teams to be created and a number of threads in each team. If -// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. -// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that -// case, it is synchronous. -int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, - ptrdiff_t *Offsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t loop_tripcount); - -// Asynchronous version of __tgt_rtl_run_target_team_region -int32_t __tgt_rtl_run_target_team_region_async( - int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, - int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, - __tgt_async_info *AsyncInfoPtr); - -// Device synchronization. In case of success, return zero. Otherwise, return an -// error code. -int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); - -#ifdef __cplusplus -} -#endif - -#endif // _OMPTARGETPLUGIN_H_ +//===-- omptargetplugin.h - Target dependent OpenMP Plugin API --*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines an interface between target independent OpenMP offload +// runtime library libomptarget and target dependent plugin. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGETPLUGIN_H_ +#define _OMPTARGETPLUGIN_H_ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Return the number of available devices of the type supported by the +// target RTL. +int32_t __tgt_rtl_number_of_devices(void); + +// Return an integer different from zero if the provided device image can be +// supported by the runtime. The functionality is similar to comparing the +// result of __tgt__rtl__load__binary to NULL. However, this is meant to be a +// lightweight query to determine if the RTL is suitable for an image without +// having to load the library, which can be expensive. +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *Image); + +// Initialize the requires flags for the device. +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags); + +// Initialize the specified device. In case of success return 0; otherwise +// return an error code. +int32_t __tgt_rtl_init_device(int32_t ID); + +// Pass an executable image section described by image to the specified +// device and prepare an address table of target entities. In case of error, +// return NULL. Otherwise, return a pointer to the built address table. +// Individual entries in the table may also be NULL, when the corresponding +// offload region is not supported on the target device. +__tgt_target_table *__tgt_rtl_load_binary(int32_t ID, + __tgt_device_image *Image); + +// Allocate data on the particular target device, of the specified size. 
+// HostPtr is a address of the host data the allocated target data +// will be associated with (HostPtr may be NULL if it is not known at +// allocation time, like for example it would be for target data that +// is allocated by omp_target_alloc() API). Return address of the +// allocated data on the target that will be used by libomptarget.so to +// initialize the target data mapping structures. These addresses are +// used to generate a table of target variables to pass to +// __tgt_rtl_run_region(). The __tgt_rtl_data_alloc() returns NULL in +// case an error occurred on the target device. +void *__tgt_rtl_data_alloc(int32_t ID, int64_t Size, void *HostPtr); + +// Pass the data content to the target device using the target address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_submit(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size); + +int32_t __tgt_rtl_data_submit_async(int32_t ID, void *TargetPtr, void *HostPtr, + int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Retrieve the data content from the target device using its address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_retrieve(int32_t ID, void *HostPtr, void *TargetPtr, + int64_t Size); + +// Asynchronous version of __tgt_rtl_data_retrieve +int32_t __tgt_rtl_data_retrieve_async(int32_t ID, void *HostPtr, + void *TargetPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// Transfer the data content from one device to the other using address. In case +// of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_transfer(int32_t ID, void *DstPtr, void *SrcPtr, + int64_t Size); + +// Asynchronous version of __tgt_rtl_data_transfer +int32_t __tgt_rtl_data_transfer_async(int32_t ID, void *DstPtr, + void *SrcPtr, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + +// De-allocate the data referenced by target ptr on the device. In case of +// success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_data_delete(int32_t ID, void *TargetPtr); + +// Transfer control to the offloaded entry Entry on the target device. +// Args and Offsets are arrays of NumArgs size of target addresses and +// offsets. An offset should be added to the target address before passing it +// to the outlined function on device side. If AsyncInfoPtr is nullptr, it is +// synchronous; otherwise it is asynchronous. However, AsyncInfoPtr may be +// ignored on some platforms, like x86_64. In that case, it is synchronous. In +// case of success, return zero. Otherwise, return an error code. +int32_t __tgt_rtl_run_target_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs); + +// Asynchronous version of __tgt_rtl_run_target_region +int32_t __tgt_rtl_run_target_region_async(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + __tgt_async_info *AsyncInfoPtr); + +// Similar to __tgt_rtl_run_target_region, but additionally specify the +// number of teams to be created and a number of threads in each team. If +// AsyncInfoPtr is nullptr, it is synchronous; otherwise it is asynchronous. +// However, AsyncInfoPtr may be ignored on some platforms, like x86_64. In that +// case, it is synchronous. 
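The asynchronous variants all share one __tgt_async_info per nowait region: the plugin lazily attaches its queue (a CUstream in the CUDA plugin) on first use, and a single __tgt_rtl_synchronize completes everything enqueued on it. A small sketch under that assumption, with the device, image, and target buffer assumed to be already set up:

// Sketch: two async operations on the same plugin-managed queue, completed by
// one synchronize at the end.
#include <cstddef>
#include <cstdint>
#include "omptargetplugin.h"   // assumed include; brings in __tgt_async_info

int submit_and_run_async(int32_t Dev, void *Entry, void *TgtBuf,
                         void *HostBuf, int64_t Size) {
  __tgt_async_info AsyncInfo;    // Queue starts as nullptr; the plugin fills it
  if (__tgt_rtl_data_submit_async(Dev, TgtBuf, HostBuf, Size, &AsyncInfo) != 0)
    return 1;

  void *Args[] = {TgtBuf};
  ptrdiff_t Offsets[] = {0};
  if (__tgt_rtl_run_target_region_async(Dev, Entry, Args, Offsets,
                                        /*NumArgs=*/1, &AsyncInfo) != 0)
    return 1;

  // Both operations were enqueued on AsyncInfo.Queue, so one synchronize is
  // enough; on some plugins (e.g. x86_64) the calls were synchronous anyway.
  return __tgt_rtl_synchronize(Dev, &AsyncInfo);
}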
+int32_t __tgt_rtl_run_target_team_region(int32_t ID, void *Entry, void **Args, + ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t loop_tripcount); + +// Asynchronous version of __tgt_rtl_run_target_team_region +int32_t __tgt_rtl_run_target_team_region_async( + int32_t ID, void *Entry, void **Args, ptrdiff_t *Offsets, int32_t NumArgs, + int32_t NumTeams, int32_t ThreadLimit, uint64_t loop_tripcount, + __tgt_async_info *AsyncInfoPtr); + +// Device synchronization. In case of success, return zero. Otherwise, return an +// error code. +int32_t __tgt_rtl_synchronize(int32_t ID, __tgt_async_info *AsyncInfoPtr); + +#ifdef __cplusplus +} +#endif + +#endif // _OMPTARGETPLUGIN_H_ diff --git a/openmp/libomptarget/plugins/CMakeLists.txt b/openmp/libomptarget/plugins/CMakeLists.txt index bb3f9c908087a..33c69e3f14b4c 100644 --- a/openmp/libomptarget/plugins/CMakeLists.txt +++ b/openmp/libomptarget/plugins/CMakeLists.txt @@ -1,77 +1,77 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build plugins for the user system if available. -# -##===----------------------------------------------------------------------===## - -# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); -# - build a plugin for an ELF based generic 64-bit target based on libffi. -# - tmachine: name of the machine processor as used in the cmake build system. -# - tmachine_name: name of the machine to be printed with the debug messages. -# - tmachine_libname: machine name to be appended to the plugin library name. -macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) -if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") - if(LIBOMPTARGET_DEP_LIBELF_FOUND) - if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - - libomptarget_say("Building ${tmachine_name} offloading plugin.") - - include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) - include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) - - # Define macro to be used as prefix of the runtime messages for this target. - add_definitions("-DTARGET_NAME=${tmachine_name}") - - # Define macro with the ELF ID for this target. - add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") - - add_library("omptarget.rtl.${tmachine_libname}" SHARED - ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) - - # Install plugin under the lib destination folder. - install(TARGETS "omptarget.rtl.${tmachine_libname}" - LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - - target_link_libraries( - "omptarget.rtl.${tmachine_libname}" - ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} - ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - dl - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") - - list(APPEND LIBOMPTARGET_TESTED_PLUGINS - "omptarget.rtl.${tmachine_libname}") - - # Report to the parent scope that we are building a plugin. 
- set(LIBOMPTARGET_SYSTEM_TARGETS - "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) - set(LIBOMPTARGET_TESTED_PLUGINS - "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - - else(LIBOMPTARGET_DEP_LIBFFI_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") - endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) - else(LIBOMPTARGET_DEP_LIBELF_FOUND) - libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") - endif(LIBOMPTARGET_DEP_LIBELF_FOUND) -else() - libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") -endif() -endmacro() - -add_subdirectory(aarch64) -add_subdirectory(cuda) -add_subdirectory(ppc64) -add_subdirectory(ppc64le) -add_subdirectory(x86_64) - -# Make sure the parent scope can see the plugins that will be created. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) -set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) - +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build plugins for the user system if available. +# +##===----------------------------------------------------------------------===## + +# void build_generic_elf64(string tmachine, string tmachine_name, string tmachine_libname, string elf_machine_id); +# - build a plugin for an ELF based generic 64-bit target based on libffi. +# - tmachine: name of the machine processor as used in the cmake build system. +# - tmachine_name: name of the machine to be printed with the debug messages. +# - tmachine_libname: machine name to be appended to the plugin library name. +macro(build_generic_elf64 tmachine tmachine_name tmachine_libname tmachine_triple elf_machine_id) +if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") + if(LIBOMPTARGET_DEP_LIBELF_FOUND) + if(LIBOMPTARGET_DEP_LIBFFI_FOUND) + + libomptarget_say("Building ${tmachine_name} offloading plugin.") + + include_directories(${LIBOMPTARGET_DEP_LIBFFI_INCLUDE_DIR}) + include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIR}) + + # Define macro to be used as prefix of the runtime messages for this target. + add_definitions("-DTARGET_NAME=${tmachine_name}") + + # Define macro with the ELF ID for this target. + add_definitions("-DTARGET_ELF_ID=${elf_machine_id}") + + add_library("omptarget.rtl.${tmachine_libname}" SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/../generic-elf-64bit/src/rtl.cpp) + + # Install plugin under the lib destination folder. + install(TARGETS "omptarget.rtl.${tmachine_libname}" + LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" + ${LIBOMPTARGET_DEP_LIBFFI_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + dl + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + + list(APPEND LIBOMPTARGET_TESTED_PLUGINS + "omptarget.rtl.${tmachine_libname}") + + # Report to the parent scope that we are building a plugin. 
+ set(LIBOMPTARGET_SYSTEM_TARGETS + "${LIBOMPTARGET_SYSTEM_TARGETS} ${tmachine_triple}" PARENT_SCOPE) + set(LIBOMPTARGET_TESTED_PLUGINS + "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + + else(LIBOMPTARGET_DEP_LIBFFI_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libffi dependency not found.") + endif(LIBOMPTARGET_DEP_LIBFFI_FOUND) + else(LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building ${tmachine_name} offloading plugin: libelf dependency not found.") + endif(LIBOMPTARGET_DEP_LIBELF_FOUND) +else() + libomptarget_say("Not building ${tmachine_name} offloading plugin: machine not found in the system.") +endif() +endmacro() + +add_subdirectory(aarch64) +add_subdirectory(cuda) +add_subdirectory(ppc64) +add_subdirectory(ppc64le) +add_subdirectory(x86_64) + +# Make sure the parent scope can see the plugins that will be created. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS}" PARENT_SCOPE) +set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE) + diff --git a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt index 350a56cb9a493..84d9be9dbc2e2 100644 --- a/openmp/libomptarget/plugins/aarch64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/aarch64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for an aarch64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") -else() - libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") -endif() +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for an aarch64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("aarch64" "aarch64" "aarch64" "aarch64-unknown-linux-gnu" "183") +else() + libomptarget_say("Not building aarch64 offloading plugin: machine not found in the system.") +endif() diff --git a/openmp/libomptarget/plugins/common/elf_common.c b/openmp/libomptarget/plugins/common/elf_common.c index b0efd1abc1489..b912bbf11a27e 100644 --- a/openmp/libomptarget/plugins/common/elf_common.c +++ b/openmp/libomptarget/plugins/common/elf_common.c @@ -1,73 +1,73 @@ -//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Common ELF functionality for target plugins. 
-// Must be included in the plugin source file AFTER omptarget.h has been -// included and macro DP(...) has been defined. -// . -// -//===----------------------------------------------------------------------===// - -#if !(defined(_OMPTARGET_H_) && defined(DP)) -#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ - included and macro DP(...) has been defined. -#endif - -#include -#include - -// Check whether an image is valid for execution on target_id -static inline int32_t elf_check_machine(__tgt_device_image *image, - uint16_t target_id) { - - // Is the library version incompatible with the header file? - if (elf_version(EV_CURRENT) == EV_NONE) { - DP("Incompatible ELF library!\n"); - return 0; - } - - char *img_begin = (char *)image->ImageStart; - char *img_end = (char *)image->ImageEnd; - size_t img_size = img_end - img_begin; - - // Obtain elf handler - Elf *e = elf_memory(img_begin, img_size); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return 0; - } - - // Check if ELF is the right kind. - if (elf_kind(e) != ELF_K_ELF) { - DP("Unexpected ELF type!\n"); - elf_end(e); - return 0; - } - Elf64_Ehdr *eh64 = elf64_getehdr(e); - Elf32_Ehdr *eh32 = elf32_getehdr(e); - - if (!eh64 && !eh32) { - DP("Unable to get machine ID from ELF file!\n"); - elf_end(e); - return 0; - } - - uint16_t MachineID; - if (eh64 && !eh32) - MachineID = eh64->e_machine; - else if (eh32 && !eh64) - MachineID = eh32->e_machine; - else { - DP("Ambiguous ELF header!\n"); - elf_end(e); - return 0; - } - - elf_end(e); - return MachineID == target_id; -} +//===-- elf_common.c - Common ELF functionality -------------------*- C -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Common ELF functionality for target plugins. +// Must be included in the plugin source file AFTER omptarget.h has been +// included and macro DP(...) has been defined. +// . +// +//===----------------------------------------------------------------------===// + +#if !(defined(_OMPTARGET_H_) && defined(DP)) +#error Include elf_common.c in the plugin source AFTER omptarget.h has been\ + included and macro DP(...) has been defined. +#endif + +#include +#include + +// Check whether an image is valid for execution on target_id +static inline int32_t elf_check_machine(__tgt_device_image *image, + uint16_t target_id) { + + // Is the library version incompatible with the header file? + if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return 0; + } + + char *img_begin = (char *)image->ImageStart; + char *img_end = (char *)image->ImageEnd; + size_t img_size = img_end - img_begin; + + // Obtain elf handler + Elf *e = elf_memory(img_begin, img_size); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return 0; + } + + // Check if ELF is the right kind. 
+ if (elf_kind(e) != ELF_K_ELF) { + DP("Unexpected ELF type!\n"); + elf_end(e); + return 0; + } + Elf64_Ehdr *eh64 = elf64_getehdr(e); + Elf32_Ehdr *eh32 = elf32_getehdr(e); + + if (!eh64 && !eh32) { + DP("Unable to get machine ID from ELF file!\n"); + elf_end(e); + return 0; + } + + uint16_t MachineID; + if (eh64 && !eh32) + MachineID = eh64->e_machine; + else if (eh32 && !eh64) + MachineID = eh32->e_machine; + else { + DP("Ambiguous ELF header!\n"); + elf_end(e); + return 0; + } + + elf_end(e); + return MachineID == target_id; +} diff --git a/openmp/libomptarget/plugins/cuda/CMakeLists.txt b/openmp/libomptarget/plugins/cuda/CMakeLists.txt index 54bcdf26e9e6b..8fee1c72767a1 100644 --- a/openmp/libomptarget/plugins/cuda/CMakeLists.txt +++ b/openmp/libomptarget/plugins/cuda/CMakeLists.txt @@ -1,45 +1,45 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a CUDA machine if available. -# -##===----------------------------------------------------------------------===## -if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) - libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") - return() -elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) - libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") - return() -elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) - libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") - return() -endif() - -libomptarget_say("Building CUDA offloading plugin.") - -# Define the suffix for the runtime messaging dumps. -add_definitions(-DTARGET_NAME=CUDA) - -include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) -include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) - -add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) - -# Install plugin under the lib destination folder. -install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") - -target_link_libraries(omptarget.rtl.cuda - ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} - ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") - -# Report to the parent scope that we are building a plugin for CUDA. -set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a CUDA machine if available. 
+# +##===----------------------------------------------------------------------===## +if (NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux")) + libomptarget_say("Not building CUDA offloading plugin: only support CUDA in Linux x86_64, ppc64le, or aarch64 hosts.") + return() +elseif (NOT LIBOMPTARGET_DEP_LIBELF_FOUND) + libomptarget_say("Not building CUDA offloading plugin: libelf dependency not found.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA not found in system.") + return() +elseif(NOT LIBOMPTARGET_DEP_CUDA_DRIVER_FOUND) + libomptarget_say("Not building CUDA offloading plugin: CUDA Driver API not found in system.") + return() +endif() + +libomptarget_say("Building CUDA offloading plugin.") + +# Define the suffix for the runtime messaging dumps. +add_definitions(-DTARGET_NAME=CUDA) + +include_directories(${LIBOMPTARGET_DEP_CUDA_INCLUDE_DIRS}) +include_directories(${LIBOMPTARGET_DEP_LIBELF_INCLUDE_DIRS}) + +add_library(omptarget.rtl.cuda SHARED src/rtl.cpp) + +# Install plugin under the lib destination folder. +install(TARGETS omptarget.rtl.cuda LIBRARY DESTINATION "${OPENMP_INSTALL_LIBDIR}") + +target_link_libraries(omptarget.rtl.cuda + ${LIBOMPTARGET_DEP_CUDA_DRIVER_LIBRARIES} + ${LIBOMPTARGET_DEP_LIBELF_LIBRARIES} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/../exports") + +# Report to the parent scope that we are building a plugin for CUDA. +set(LIBOMPTARGET_SYSTEM_TARGETS "${LIBOMPTARGET_SYSTEM_TARGETS} nvptx64-nvidia-cuda" PARENT_SCOPE) diff --git a/openmp/libomptarget/plugins/cuda/src/rtl.cpp b/openmp/libomptarget/plugins/cuda/src/rtl.cpp index 4ad58e290252d..9e3f1e0b35bea 100644 --- a/openmp/libomptarget/plugins/cuda/src/rtl.cpp +++ b/openmp/libomptarget/plugins/cuda/src/rtl.cpp @@ -1,1042 +1,1088 @@ -//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for CUDA machine -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME CUDA -#endif - -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) - -// Utility for retrieving and printing CUDA error string. -#define CUDA_ERR_STRING(err) \ - do { \ - if (DebugLevel > 0) { \ - const char *errStr; \ - cuGetErrorString(err, &errStr); \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#define CUDA_ERR_STRING(err) {} -#endif // OMPTARGET_DEBUG - -#include "../../common/elf_common.c" - -/// Keep entries table per device. 
-struct FuncOrGblEntryTy { - __tgt_target_table Table; - std::vector<__tgt_offload_entry> Entries; -}; - -enum ExecutionModeType { - SPMD, // constructors, destructors, - // combined constructs (`teams distribute parallel for [simd]`) - GENERIC, // everything else - NONE -}; - -/// Use a single entity to encode a kernel and a set of flags. -struct KernelTy { - CUfunction Func; - - // execution mode of kernel - // 0 - SPMD mode (without master warp) - // 1 - Generic mode (with master warp) - int8_t ExecutionMode; - - KernelTy(CUfunction _Func, int8_t _ExecutionMode) - : Func(_Func), ExecutionMode(_ExecutionMode) {} -}; - -/// Device environment data -/// Manually sync with the deviceRTL side for now, move to a dedicated header -/// file later. -struct omptarget_device_environmentTy { - int32_t debug_level; -}; - -/// List that contains all the kernels. -/// FIXME: we may need this to be per device and per library. -std::list KernelsList; - -namespace { -bool checkResult(CUresult Err, const char *ErrMsg) { - if (Err == CUDA_SUCCESS) - return true; - - DP(ErrMsg); - CUDA_ERR_STRING(Err); - return false; -} - -// Structure contains per-device data -struct DeviceDataTy { - std::list FuncGblEntries; - CUcontext Context = nullptr; - // Device properties - int ThreadsPerBlock = 0; - int BlocksPerGrid = 0; - int WarpSize = 0; - // OpenMP properties - int NumTeams = 0; - int NumThreads = 0; -}; - -class StreamManagerTy { - int NumberOfDevices; - // The initial size of stream pool - int EnvNumInitialStreams; - // Per-device stream mutex - std::vector> StreamMtx; - // Per-device stream Id indicates the next available stream in the pool - std::vector NextStreamId; - // Per-device stream pool - std::vector> StreamPool; - // Reference to per-device data - std::vector &DeviceData; - - // If there is no CUstream left in the pool, we will resize the pool to - // allocate more CUstream. This function should be called with device mutex, - // and we do not resize to smaller one. - void resizeStreamPool(const int DeviceId, const size_t NewSize) { - std::vector &Pool = StreamPool[DeviceId]; - const size_t CurrentSize = Pool.size(); - assert(NewSize > CurrentSize && "new size is not larger than current size"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { - // We will return if cannot switch to the right context in case of - // creating bunch of streams that are not corresponding to the right - // device. The offloading will fail later because selected CUstream is - // nullptr. 
- return; - } - - Pool.resize(NewSize, nullptr); - - for (size_t I = CurrentSize; I < NewSize; ++I) { - checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), - "Error returned from cuStreamCreate\n"); - } - } - -public: - StreamManagerTy(const int NumberOfDevices, - std::vector &DeviceData) - : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), - DeviceData(DeviceData) { - StreamPool.resize(NumberOfDevices); - NextStreamId.resize(NumberOfDevices); - StreamMtx.resize(NumberOfDevices); - - if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) - EnvNumInitialStreams = std::stoi(EnvStr); - - // Initialize the next stream id - std::fill(NextStreamId.begin(), NextStreamId.end(), 0); - - // Initialize stream mutex - for (std::unique_ptr &Ptr : StreamMtx) - Ptr = std::make_unique(); - } - - ~StreamManagerTy() { - // Destroy streams - for (int I = 0; I < NumberOfDevices; ++I) { - checkResult(cuCtxSetCurrent(DeviceData[I].Context), - "Error returned from cuCtxSetCurrent\n"); - - for (CUstream &S : StreamPool[I]) { - if (S) - checkResult(cuStreamDestroy(S), - "Error returned from cuStreamDestroy\n"); - } - } - } - - // Get a CUstream from pool. Per-device next stream id always points to the - // next available CUstream. That means, CUstreams [0, id-1] have been - // assigned, and [id,] are still available. If there is no CUstream left, we - // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, - // the id will increase one. - // xxxxxs+++++++++ - // ^ - // id - // After assignment, the pool becomes the following and s is assigned. - // xxxxxs+++++++++ - // ^ - // id - CUstream getStream(const int DeviceId) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - // No CUstream left in the pool, we need to request from CUDA RT - if (Id == StreamPool[DeviceId].size()) { - // By default we double the stream pool every time - resizeStreamPool(DeviceId, Id * 2); - } - return StreamPool[DeviceId][Id++]; - } - - // Return a CUstream back to pool. As mentioned above, per-device next - // stream is always points to the next available CUstream, so when we return - // a CUstream, we need to first decrease the id, and then copy the CUstream - // back. - // It is worth noting that, the order of streams return might be different - // from that they're assigned, that saying, at some point, there might be - // two identical CUstreams. - // xxax+a+++++ - // ^ - // id - // However, it doesn't matter, because they're always on the two sides of - // id. The left one will in the end be overwritten by another CUstream. - // Therefore, after several execution, the order of pool might be different - // from its initial state. 
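The pool policy described above (hand out from the front, grow by doubling on exhaustion, return by decrementing the cursor) is independent of CUDA. Here is a self-contained sketch of the same idea with the stream type replaced by a plain integer handle, so it compiles without the driver API; HandlePool and NextFresh are illustrative names, not part of the plugin.

// Grow-only handle pool mirroring the StreamManagerTy policy: [0, NextId)
// have been handed out, [NextId, size()) are available, and it never shrinks.
#include <cassert>
#include <cstddef>
#include <mutex>
#include <vector>

class HandlePool {
  std::vector<int> Pool;
  size_t NextId = 0;
  std::mutex Mtx;
  int NextFresh = 0;                   // stands in for cuStreamCreate

  void grow(size_t NewSize) {
    assert(NewSize > Pool.size() && "pool never shrinks");
    while (Pool.size() < NewSize)
      Pool.push_back(NextFresh++);
  }

public:
  explicit HandlePool(size_t Initial = 32) { grow(Initial ? Initial : 1); }

  int get() {
    std::lock_guard<std::mutex> L(Mtx);
    if (NextId == Pool.size())
      grow(Pool.size() * 2);           // double on exhaustion, as the plugin does
    return Pool[NextId++];
  }

  void put(int Handle) {
    std::lock_guard<std::mutex> L(Mtx);
    assert(NextId > 0 && "returning to a full pool");
    Pool[--NextId] = Handle;           // return order may differ from handout order
  }
};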
- void returnStream(const int DeviceId, CUstream Stream) { - const std::lock_guard Lock(*StreamMtx[DeviceId]); - int &Id = NextStreamId[DeviceId]; - assert(Id > 0 && "Wrong stream ID"); - StreamPool[DeviceId][--Id] = Stream; - } - - bool initializeDeviceStreamPool(const int DeviceId) { - assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); - - resizeStreamPool(DeviceId, EnvNumInitialStreams); - - // Check the size of stream pool - if (StreamPool[DeviceId].size() != EnvNumInitialStreams) - return false; - - // Check whether each stream is valid - for (CUstream &S : StreamPool[DeviceId]) - if (!S) - return false; - - return true; - } -}; - -class DeviceRTLTy { - int NumberOfDevices; - // OpenMP environment properties - int EnvNumTeams; - int EnvTeamLimit; - // OpenMP requires flags - int64_t RequiresFlags; - - static constexpr const int HardTeamLimit = 1U << 16U; // 64k - static constexpr const int HardThreadLimit = 1024; - static constexpr const int DefaultNumTeams = 128; - static constexpr const int DefaultNumThreads = 128; - - std::unique_ptr StreamManager; - std::vector DeviceData; - std::vector Modules; - - // Record entry point associated with device - void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.push_back(entry); - } - - // Return true if the entry is associated with device - bool findOffloadEntry(const int DeviceId, const void *Addr) const { - for (const __tgt_offload_entry &Itr : - DeviceData[DeviceId].FuncGblEntries.back().Entries) - if (Itr.addr == Addr) - return true; - - return false; - } - - // Return the pointer to the target entries table - __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - - if (E.Entries.empty()) - return nullptr; - - // Update table info according to the entries and return the pointer - E.Table.EntriesBegin = E.Entries.data(); - E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); - - return &E.Table; - } - - // Clear entries table for a device - void clearOffloadEntriesTable(const int DeviceId) { - DeviceData[DeviceId].FuncGblEntries.emplace_back(); - FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); - E.Entries.clear(); - E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; - } - - CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - if (!AsyncInfoPtr->Queue) - AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); - - return reinterpret_cast(AsyncInfoPtr->Queue); - } - -public: - // This class should not be copied - DeviceRTLTy(const DeviceRTLTy &) = delete; - DeviceRTLTy(DeviceRTLTy &&) = delete; - - DeviceRTLTy() - : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), - RequiresFlags(OMP_REQ_UNDEFINED) { -#ifdef OMPTARGET_DEBUG - if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) - DebugLevel = std::stoi(EnvStr); -#endif // OMPTARGET_DEBUG - - DP("Start initializing CUDA\n"); - - CUresult Err = cuInit(0); - if (!checkResult(Err, "Error returned from cuInit\n")) { - return; - } - - Err = cuDeviceGetCount(&NumberOfDevices); - if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) - return; - - if (NumberOfDevices == 0) { - DP("There are no devices supporting CUDA.\n"); - return; - } - - DeviceData.resize(NumberOfDevices); - - // Get environment variables regarding teams - if (const char *EnvStr = 
getenv("OMP_TEAM_LIMIT")) { - // OMP_TEAM_LIMIT has been set - EnvTeamLimit = std::stoi(EnvStr); - DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); - } - if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { - // OMP_NUM_TEAMS has been set - EnvNumTeams = std::stoi(EnvStr); - DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); - } - - StreamManager = - std::make_unique(NumberOfDevices, DeviceData); - } - - ~DeviceRTLTy() { - // First destruct stream manager in case of Contexts is destructed before it - StreamManager = nullptr; - - for (CUmodule &M : Modules) - // Close module - if (M) - checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); - - for (DeviceDataTy &D : DeviceData) { - // Destroy context - if (D.Context) - checkResult(cuCtxDestroy(D.Context), - "Error returned from cuCtxDestroy\n"); - } - } - - // Check whether a given DeviceId is valid - bool isValidDeviceId(const int DeviceId) const { - return DeviceId >= 0 && DeviceId < NumberOfDevices; - } - - bool getNumOfDevices() const { return NumberOfDevices; } - - void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } - - int initDevice(const int DeviceId) { - CUdevice Device; - - DP("Getting device %d\n", DeviceId); - CUresult Err = cuDeviceGet(&Device, DeviceId); - if (!checkResult(Err, "Error returned from cuDeviceGet\n")) - return OFFLOAD_FAIL; - - // Create the context and save it to use whenever this device is selected. - Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC, - Device); - if (!checkResult(Err, "Error returned from cuCtxCreate\n")) - return OFFLOAD_FAIL; - - Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // Initialize stream pool - if (!StreamManager->initializeDeviceStreamPool(DeviceId)) - return OFFLOAD_FAIL; - - // Query attributes to determine number of threads/block and blocks/grid. - int MaxGridDimX; - Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, - Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max grid dimension, use default value %d\n", - DeviceRTLTy::DefaultNumTeams); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; - } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { - DP("Using %d CUDA blocks per grid\n", MaxGridDimX); - DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; - } else { - DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " - "at the hard limit\n", - MaxGridDimX, DeviceRTLTy::HardTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; - } - - // We are only exploiting threads along the x axis. 
- int MaxBlockDimX; - Err = cuDeviceGetAttribute(&MaxBlockDimX, - CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting max block dimension, use default value %d\n", - DeviceRTLTy::DefaultNumThreads); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads; - } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) { - DP("Using %d CUDA threads per block\n", MaxBlockDimX); - DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX; - } else { - DP("Max CUDA threads per block %d exceeds the hard thread limit %d, " - "capping at the hard limit\n", - MaxBlockDimX, DeviceRTLTy::HardThreadLimit); - DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit; - } - - // Get and set warp size - int WarpSize; - Err = - cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device); - if (Err != CUDA_SUCCESS) { - DP("Error getting warp size, assume default value 32\n"); - DeviceData[DeviceId].WarpSize = 32; - } else { - DP("Using warp size %d\n", WarpSize); - DeviceData[DeviceId].WarpSize = WarpSize; - } - - // Adjust teams to the env variables - if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) { - DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n", - EnvTeamLimit); - DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit; - } - - DP("Max number of CUDA blocks %d, threads %d & warp size %d\n", - DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock, - DeviceData[DeviceId].WarpSize); - - // Set default number of teams - if (EnvNumTeams > 0) { - DP("Default number of teams set according to environment %d\n", - EnvNumTeams); - DeviceData[DeviceId].NumTeams = EnvNumTeams; - } else { - DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams; - DP("Default number of teams set according to library's default %d\n", - DeviceRTLTy::DefaultNumTeams); - } - - if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) { - DP("Default number of teams exceeds device limit, capping at %d\n", - DeviceData[DeviceId].BlocksPerGrid); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid; - } - - // Set default number of threads - DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads; - DP("Default number of threads set according to library's default %d\n", - DeviceRTLTy::DefaultNumThreads); - if (DeviceData[DeviceId].NumThreads > - DeviceData[DeviceId].ThreadsPerBlock) { - DP("Default number of threads exceeds device limit, capping at %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].ThreadsPerBlock; - } - - return OFFLOAD_SUCCESS; - } - - __tgt_target_table *loadBinary(const int DeviceId, - const __tgt_device_image *Image) { - // Set the context we are using - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return nullptr; - - // Clear the offload table as we are going to create a new one. - clearOffloadEntriesTable(DeviceId); - - // Create the module and extract the function pointers. - CUmodule Module; - DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart)); - Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr); - if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n")) - return nullptr; - - DP("CUDA module successfully loaded!\n"); - - Modules.push_back(Module); - - // Find the symbols in the module by name. 
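The loop that follows resolves each host entry to either a device global or a CUfunction, and reads a companion "<name>_exec_mode" byte to decide between SPMD and generic launch. A condensed sketch of that lookup using the same driver-API calls; readExecMode is an illustrative helper name, and error handling is reduced to returning the generic default.

// Condensed sketch of the "<kernel>_exec_mode" lookup performed while loading
// a binary: a missing symbol or a size mismatch falls back to generic mode (1).
#include <cstdint>
#include <string>
#include <cuda.h>

static int8_t readExecMode(CUmodule Module, const char *KernelName) {
  std::string GlobalName = std::string(KernelName) + "_exec_mode";
  CUdeviceptr Ptr;
  size_t Bytes;
  if (cuModuleGetGlobal(&Ptr, &Bytes, Module, GlobalName.c_str()) !=
          CUDA_SUCCESS ||
      Bytes != sizeof(int8_t))
    return 1;                           // generic mode is the documented default

  int8_t Mode = 1;
  if (cuMemcpyDtoH(&Mode, Ptr, sizeof(int8_t)) != CUDA_SUCCESS)
    return 1;
  return Mode;                          // 0 = SPMD, 1 = generic
}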
- const __tgt_offload_entry *HostBegin = Image->EntriesBegin; - const __tgt_offload_entry *HostEnd = Image->EntriesEnd; - - for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { - if (!E->addr) { - // We return nullptr when something like this happens, the host should - // have always something in the address to uniquely identify the target - // region. - DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); - return nullptr; - } - - if (E->size) { - __tgt_offload_entry Entry = *E; - CUdeviceptr CUPtr; - size_t CUSize; - Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - DP("Loading global '%s' (Failed)\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (CUSize != E->size) { - DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, - CUSize, E->size); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); - - Entry.addr = (void *)(CUPtr); - - // Note: In the current implementation declare target variables - // can either be link or to. This means that once unified - // memory is activated via the requires directive, the variable - // can be used directly from the host in both cases. - // TODO: when variables types other than to or link are added, - // the below condition should be changed to explicitly - // check for to and link variables types: - // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & - // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) - if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If unified memory is present any target link or to variables - // can access host addresses directly. There is no longer a - // need for device copies. - cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); - DP("Copy linked variable host address (" DPxMOD - ") to device address (" DPxMOD ")\n", - DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); - } - - addOffloadEntry(DeviceId, Entry); - - continue; - } - - CUfunction Func; - Err = cuModuleGetFunction(&Func, Module, E->name); - // We keep this style here because we need the name - if (Err != CUDA_SUCCESS) { - DP("Loading '%s' (Failed)\n", E->name); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", - DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); - - // default value GENERIC (in case symbol is missing from cubin file) - int8_t ExecModeVal = ExecutionModeType::GENERIC; - std::string ExecModeNameStr(E->name); - ExecModeNameStr += "_exec_mode"; - const char *ExecModeName = ExecModeNameStr.c_str(); - - CUdeviceptr ExecModePtr; - size_t CUSize; - Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(int8_t)) { - DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", - ExecModeName, CUSize, sizeof(int8_t)); - return nullptr; - } - - Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. 
Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", - DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - if (ExecModeVal < 0 || ExecModeVal > 1) { - DP("Error wrong exec_mode value specified in cubin file: %d\n", - ExecModeVal); - return nullptr; - } - } else { - DP("Loading global exec_mode '%s' - symbol missing, using default " - "value GENERIC (1)\n", - ExecModeName); - CUDA_ERR_STRING(Err); - } - - KernelsList.emplace_back(Func, ExecModeVal); - - __tgt_offload_entry Entry = *E; - Entry.addr = &KernelsList.back(); - addOffloadEntry(DeviceId, Entry); - } - - // send device environment data to the device - { - omptarget_device_environmentTy DeviceEnv{0}; - -#ifdef OMPTARGET_DEBUG - if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) - DeviceEnv.debug_level = std::stoi(EnvStr); -#endif - - const char *DeviceEnvName = "omptarget_device_environment"; - CUdeviceptr DeviceEnvPtr; - size_t CUSize; - - Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); - if (Err == CUDA_SUCCESS) { - if (CUSize != sizeof(DeviceEnv)) { - DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", - DeviceEnvName, CUSize, sizeof(int32_t)); - CUDA_ERR_STRING(Err); - return nullptr; - } - - Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. Pointers: " - "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", - DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); - CUDA_ERR_STRING(Err); - return nullptr; - } - - DP("Sending global device environment data %zu bytes\n", CUSize); - } else { - DP("Finding global device environment '%s' - symbol missing.\n", - DeviceEnvName); - DP("Continue, considering this is a device RTL which does not accept " - "environment setting.\n"); - } - } - - return getOffloadEntriesTable(DeviceId); - } - - void *dataAlloc(const int DeviceId, const int64_t Size) const { - if (Size == 0) - return nullptr; - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return nullptr; - - CUdeviceptr DevicePtr; - Err = cuMemAlloc(&DevicePtr, Size); - if (!checkResult(Err, "Error returned from cuMemAlloc\n")) - return nullptr; - - return (void *)DevicePtr; - } - - int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, - const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - CUstream Stream = getStream(DeviceId, AsyncInfoPtr); - - Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, - const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { - assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); - - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - CUstream Stream = getStream(DeviceId, AsyncInfoPtr); - - Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when copying data from device to host. Pointers: host = " DPxMOD - ", device = " DPxMOD ", size = %" PRId64 "\n", - DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; - } - - int dataDelete(const int DeviceId, void *TgtPtr) const { - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - Err = cuMemFree((CUdeviceptr)TgtPtr); - if (!checkResult(Err, "Error returned from cuMemFree\n")) - return OFFLOAD_FAIL; - - return OFFLOAD_SUCCESS; - } - - int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, - void **TgtArgs, ptrdiff_t *TgtOffsets, - const int ArgNum, const int TeamNum, - const int ThreadLimit, - const unsigned int LoopTripCount, - __tgt_async_info *AsyncInfo) const { - CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); - if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) - return OFFLOAD_FAIL; - - // All args are references. - std::vector Args(ArgNum); - std::vector Ptrs(ArgNum); - - for (int I = 0; I < ArgNum; ++I) { - Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]); - Args[I] = &Ptrs[I]; - } - - const KernelTy *KernelInfo = - reinterpret_cast(TgtEntryPtr); - - unsigned int CudaThreadsPerBlock; - if (ThreadLimit > 0) { - DP("Setting CUDA threads per block to requested %d\n", ThreadLimit); - CudaThreadsPerBlock = ThreadLimit; - // Add master warp if necessary - if (KernelInfo->ExecutionMode == GENERIC) { - DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize); - CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize; - } - } else { - DP("Setting CUDA threads per block to default %d\n", - DeviceData[DeviceId].NumThreads); - CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads; - } - - if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) { - DP("Threads per block capped at device limit %d\n", - DeviceData[DeviceId].ThreadsPerBlock); - CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock; - } - - int KernelLimit; - Err = cuFuncGetAttribute(&KernelLimit, - CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK, - KernelInfo->Func); - if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) { - DP("Threads per block capped at kernel limit %d\n", KernelLimit); - CudaThreadsPerBlock = KernelLimit; - } - - unsigned int CudaBlocksPerGrid; - if (TeamNum <= 0) { - if (LoopTripCount > 0 && EnvNumTeams < 0) { - if (KernelInfo->ExecutionMode == SPMD) { - // We have a combined construct, i.e. `target teams distribute - // parallel for [simd]`. We launch so many teams so that each thread - // will execute one iteration of the loop. 
round up to the nearest - // integer - CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1; - } else { - // If we reach this point, then we have a non-combined construct, i.e. - // `teams distribute` with a nested `parallel for` and each team is - // assigned one iteration of the `distribute` loop. E.g.: - // - // #pragma omp target teams distribute - // for(...loop_tripcount...) { - // #pragma omp parallel for - // for(...) {} - // } - // - // Threads within a team will execute the iterations of the `parallel` - // loop. - CudaBlocksPerGrid = LoopTripCount; - } - DP("Using %d teams due to loop trip count %" PRIu64 - " and number of threads per block %d\n", - CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock); - } else { - DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams); - CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams; - } - } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) { - DP("Capping number of teams to team limit %d\n", - DeviceData[DeviceId].BlocksPerGrid); - CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid; - } else { - DP("Using requested number of teams %d\n", TeamNum); - CudaBlocksPerGrid = TeamNum; - } - - // Run on the device. - DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, - CudaThreadsPerBlock); - - CUstream Stream = getStream(DeviceId, AsyncInfo); - Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, - /* gridDimZ */ 1, CudaThreadsPerBlock, - /* blockDimY */ 1, /* blockDimZ */ 1, - /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); - if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) - return OFFLOAD_FAIL; - - DP("Launch of entry point at " DPxMOD " successful!\n", - DPxPTR(TgtEntryPtr)); - - return OFFLOAD_SUCCESS; - } - - int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { - CUstream Stream = reinterpret_cast(AsyncInfoPtr->Queue); - CUresult Err = cuStreamSynchronize(Stream); - if (Err != CUDA_SUCCESS) { - DP("Error when synchronizing stream. stream = " DPxMOD - ", async info ptr = " DPxMOD "\n", - DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); - CUDA_ERR_STRING(Err); - return OFFLOAD_FAIL; - } - - // Once the stream is synchronized, return it to stream pool and reset - // async_info. This is to make sure the synchronization only works for its - // own tasks. 
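Returning briefly to the team-count selection above: for an SPMD (combined-construct) kernel with a known trip count, the grid size is the trip count divided by the threads per block, rounded up so every iteration gets a thread. A tiny worked helper, with teamsForTripCount as an illustrative name:

// Ceiling division used to size the grid for combined constructs: one thread
// per loop iteration, rounded up to whole blocks.
#include <cassert>
#include <cstdint>

static unsigned teamsForTripCount(uint64_t TripCount, unsigned ThreadsPerBlock) {
  assert(TripCount > 0 && ThreadsPerBlock > 0);
  return static_cast<unsigned>((TripCount - 1) / ThreadsPerBlock) + 1;
}

// Example: 1000 iterations at 128 threads per block -> 8 blocks
// (7 full blocks cover 896 iterations, the eighth covers the last 104).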
- StreamManager->returnStream( - DeviceId, reinterpret_cast(AsyncInfoPtr->Queue)); - AsyncInfoPtr->Queue = nullptr; - - return OFFLOAD_SUCCESS; - } -}; - -DeviceRTLTy DeviceRTL; -} // namespace - -// Exposed library API function -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { - return elf_check_machine(image, /* EM_CUDA */ 190); -} - -int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } - -int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { - DP("Init requires flags to %ld\n", RequiresFlags); - DeviceRTL.setRequiresFlag(RequiresFlags); - return RequiresFlags; -} - -int32_t __tgt_rtl_init_device(int32_t device_id) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.initDevice(device_id); -} - -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.loadBinary(device_id, image); -} - -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.dataAlloc(device_id, size); -} - -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, - size, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, - void *hst_ptr, int64_t size, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - - return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, - async_info_ptr); -} - -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, - size, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr, - void *tgt_ptr, int64_t size, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - - return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size, - async_info_ptr); -} - -int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.dataDelete(device_id, tgt_ptr); -} - -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t team_num, - int32_t thread_limit, - uint64_t loop_tripcount) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_run_target_team_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, - thread_limit, loop_tripcount, &async_info); - if (rc != 
OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_run_target_team_region_async( - int32_t device_id, void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num, - int32_t thread_limit, uint64_t loop_tripcount, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return DeviceRTL.runTargetTeamRegion( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num, - thread_limit, loop_tripcount, async_info_ptr); -} - -int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - __tgt_async_info async_info; - const int32_t rc = __tgt_rtl_run_target_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info); - if (rc != OFFLOAD_SUCCESS) - return OFFLOAD_FAIL; - - return __tgt_rtl_synchronize(device_id, &async_info); -} - -int32_t __tgt_rtl_run_target_region_async(int32_t device_id, - void *tgt_entry_ptr, void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - - return __tgt_rtl_run_target_team_region_async( - device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, - /* team num*/ 1, /* thread_limit */ 1, /* loop_tripcount */ 0, - async_info_ptr); -} - -int32_t __tgt_rtl_synchronize(int32_t device_id, - __tgt_async_info *async_info_ptr) { - assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); - assert(async_info_ptr && "async_info_ptr is nullptr"); - assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); - - return DeviceRTL.synchronize(device_id, async_info_ptr); -} - -#ifdef __cplusplus -} -#endif +//===----RTLs/cuda/src/rtl.cpp - Target RTLs Implementation ------- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for CUDA machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME CUDA +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) + +// Utility for retrieving and printing CUDA error string. +#define CUDA_ERR_STRING(err) \ + do { \ + if (DebugLevel > 0) { \ + const char *errStr; \ + cuGetErrorString(err, &errStr); \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", "CUDA error is: %s\n", errStr); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#define CUDA_ERR_STRING(err) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +/// Keep entries table per device. 
+struct FuncOrGblEntryTy { + __tgt_target_table Table; + std::vector<__tgt_offload_entry> Entries; +}; + +enum ExecutionModeType { + SPMD, // constructors, destructors, + // combined constructs (`teams distribute parallel for [simd]`) + GENERIC, // everything else + NONE +}; + +/// Use a single entity to encode a kernel and a set of flags. +struct KernelTy { + CUfunction Func; + + // execution mode of kernel + // 0 - SPMD mode (without master warp) + // 1 - Generic mode (with master warp) + int8_t ExecutionMode; + + KernelTy(CUfunction _Func, int8_t _ExecutionMode) + : Func(_Func), ExecutionMode(_ExecutionMode) {} +}; + +/// Device environment data +/// Manually sync with the deviceRTL side for now, move to a dedicated header +/// file later. +struct omptarget_device_environmentTy { + int32_t debug_level; +}; + +/// List that contains all the kernels. +/// FIXME: we may need this to be per device and per library. +std::list KernelsList; + +namespace { +bool checkResult(CUresult Err, const char *ErrMsg) { + if (Err == CUDA_SUCCESS) + return true; + + DP(ErrMsg); + CUDA_ERR_STRING(Err); + return false; +} + +// Structure contains per-device data +struct DeviceDataTy { + std::list FuncGblEntries; + CUcontext Context = nullptr; + // Device properties + int ThreadsPerBlock = 0; + int BlocksPerGrid = 0; + int WarpSize = 0; + // OpenMP properties + int NumTeams = 0; + int NumThreads = 0; +}; + +class StreamManagerTy { + int NumberOfDevices; + // The initial size of stream pool + int EnvNumInitialStreams; + // Per-device stream mutex + std::vector> StreamMtx; + // Per-device stream Id indicates the next available stream in the pool + std::vector NextStreamId; + // Per-device stream pool + std::vector> StreamPool; + // Reference to per-device data + std::vector &DeviceData; + + // If there is no CUstream left in the pool, we will resize the pool to + // allocate more CUstream. This function should be called with device mutex, + // and we do not resize to smaller one. + void resizeStreamPool(const int DeviceId, const size_t NewSize) { + std::vector &Pool = StreamPool[DeviceId]; + const size_t CurrentSize = Pool.size(); + assert(NewSize > CurrentSize && "new size is not larger than current size"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) { + // We will return if cannot switch to the right context in case of + // creating bunch of streams that are not corresponding to the right + // device. The offloading will fail later because selected CUstream is + // nullptr. 
+ return; + } + + Pool.resize(NewSize, nullptr); + + for (size_t I = CurrentSize; I < NewSize; ++I) { + checkResult(cuStreamCreate(&Pool[I], CU_STREAM_NON_BLOCKING), + "Error returned from cuStreamCreate\n"); + } + } + +public: + StreamManagerTy(const int NumberOfDevices, + std::vector &DeviceData) + : NumberOfDevices(NumberOfDevices), EnvNumInitialStreams(32), + DeviceData(DeviceData) { + StreamPool.resize(NumberOfDevices); + NextStreamId.resize(NumberOfDevices); + StreamMtx.resize(NumberOfDevices); + + if (const char *EnvStr = getenv("LIBOMPTARGET_NUM_INITIAL_STREAMS")) + EnvNumInitialStreams = std::stoi(EnvStr); + + // Initialize the next stream id + std::fill(NextStreamId.begin(), NextStreamId.end(), 0); + + // Initialize stream mutex + for (std::unique_ptr &Ptr : StreamMtx) + Ptr = std::make_unique(); + } + + ~StreamManagerTy() { + // Destroy streams + for (int I = 0; I < NumberOfDevices; ++I) { + checkResult(cuCtxSetCurrent(DeviceData[I].Context), + "Error returned from cuCtxSetCurrent\n"); + + for (CUstream &S : StreamPool[I]) { + if (S) + checkResult(cuStreamDestroy(S), + "Error returned from cuStreamDestroy\n"); + } + } + } + + // Get a CUstream from pool. Per-device next stream id always points to the + // next available CUstream. That means, CUstreams [0, id-1] have been + // assigned, and [id,] are still available. If there is no CUstream left, we + // will ask more CUstreams from CUDA RT. Each time a CUstream is assigned, + // the id will increase one. + // xxxxxs+++++++++ + // ^ + // id + // After assignment, the pool becomes the following and s is assigned. + // xxxxxs+++++++++ + // ^ + // id + CUstream getStream(const int DeviceId) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + // No CUstream left in the pool, we need to request from CUDA RT + if (Id == StreamPool[DeviceId].size()) { + // By default we double the stream pool every time + resizeStreamPool(DeviceId, Id * 2); + } + return StreamPool[DeviceId][Id++]; + } + + // Return a CUstream back to pool. As mentioned above, per-device next + // stream is always points to the next available CUstream, so when we return + // a CUstream, we need to first decrease the id, and then copy the CUstream + // back. + // It is worth noting that, the order of streams return might be different + // from that they're assigned, that saying, at some point, there might be + // two identical CUstreams. + // xxax+a+++++ + // ^ + // id + // However, it doesn't matter, because they're always on the two sides of + // id. The left one will in the end be overwritten by another CUstream. + // Therefore, after several execution, the order of pool might be different + // from its initial state. 
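+  // Illustrative walk-through of the bookkeeping above: with a pool
+  // {a, b, c, d} and id == 2, two getStream() calls hand out c and d and
+  // move id to 4; returnStream(d) then moves id back to 3 and writes d into
+  // slot 3. Slots below id may later hold different streams than they
+  // started with, which is harmless for the reasons described above.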
+ void returnStream(const int DeviceId, CUstream Stream) { + const std::lock_guard Lock(*StreamMtx[DeviceId]); + int &Id = NextStreamId[DeviceId]; + assert(Id > 0 && "Wrong stream ID"); + StreamPool[DeviceId][--Id] = Stream; + } + + bool initializeDeviceStreamPool(const int DeviceId) { + assert(StreamPool[DeviceId].empty() && "stream pool has been initialized"); + + resizeStreamPool(DeviceId, EnvNumInitialStreams); + + // Check the size of stream pool + if (StreamPool[DeviceId].size() != EnvNumInitialStreams) + return false; + + // Check whether each stream is valid + for (CUstream &S : StreamPool[DeviceId]) + if (!S) + return false; + + return true; + } +}; + +class DeviceRTLTy { + int NumberOfDevices; + // OpenMP environment properties + int EnvNumTeams; + int EnvTeamLimit; + // OpenMP requires flags + int64_t RequiresFlags; + + static constexpr const int HardTeamLimit = 1U << 16U; // 64k + static constexpr const int HardThreadLimit = 1024; + static constexpr const int DefaultNumTeams = 128; + static constexpr const int DefaultNumThreads = 128; + + std::unique_ptr StreamManager; + std::vector DeviceData; + std::vector Modules; + + // Record entry point associated with device + void addOffloadEntry(const int DeviceId, const __tgt_offload_entry entry) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + E.Entries.push_back(entry); + } + + // Return true if the entry is associated with device + bool findOffloadEntry(const int DeviceId, const void *Addr) const { + for (const __tgt_offload_entry &Itr : + DeviceData[DeviceId].FuncGblEntries.back().Entries) + if (Itr.addr == Addr) + return true; + + return false; + } + + // Return the pointer to the target entries table + __tgt_target_table *getOffloadEntriesTable(const int DeviceId) { + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + + if (E.Entries.empty()) + return nullptr; + + // Update table info according to the entries and return the pointer + E.Table.EntriesBegin = E.Entries.data(); + E.Table.EntriesEnd = E.Entries.data() + E.Entries.size(); + + return &E.Table; + } + + // Clear entries table for a device + void clearOffloadEntriesTable(const int DeviceId) { + DeviceData[DeviceId].FuncGblEntries.emplace_back(); + FuncOrGblEntryTy &E = DeviceData[DeviceId].FuncGblEntries.back(); + E.Entries.clear(); + E.Table.EntriesBegin = E.Table.EntriesEnd = nullptr; + } + + CUstream getStream(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + if (!AsyncInfoPtr->Queue) + AsyncInfoPtr->Queue = StreamManager->getStream(DeviceId); + + return reinterpret_cast(AsyncInfoPtr->Queue); + } + +public: + // This class should not be copied + DeviceRTLTy(const DeviceRTLTy &) = delete; + DeviceRTLTy(DeviceRTLTy &&) = delete; + + DeviceRTLTy() + : NumberOfDevices(0), EnvNumTeams(-1), EnvTeamLimit(-1), + RequiresFlags(OMP_REQ_UNDEFINED) { +#ifdef OMPTARGET_DEBUG + if (const char *EnvStr = getenv("LIBOMPTARGET_DEBUG")) + DebugLevel = std::stoi(EnvStr); +#endif // OMPTARGET_DEBUG + + DP("Start initializing CUDA\n"); + + CUresult Err = cuInit(0); + if (!checkResult(Err, "Error returned from cuInit\n")) { + return; + } + + Err = cuDeviceGetCount(&NumberOfDevices); + if (!checkResult(Err, "Error returned from cuDeviceGetCount\n")) + return; + + if (NumberOfDevices == 0) { + DP("There are no devices supporting CUDA.\n"); + return; + } + + DeviceData.resize(NumberOfDevices); + + // Get environment variables regarding teams + if (const char *EnvStr = 
getenv("OMP_TEAM_LIMIT")) { + // OMP_TEAM_LIMIT has been set + EnvTeamLimit = std::stoi(EnvStr); + DP("Parsed OMP_TEAM_LIMIT=%d\n", EnvTeamLimit); + } + if (const char *EnvStr = getenv("OMP_NUM_TEAMS")) { + // OMP_NUM_TEAMS has been set + EnvNumTeams = std::stoi(EnvStr); + DP("Parsed OMP_NUM_TEAMS=%d\n", EnvNumTeams); + } + + StreamManager = + std::make_unique(NumberOfDevices, DeviceData); + } + + ~DeviceRTLTy() { + // First destruct stream manager in case of Contexts is destructed before it + StreamManager = nullptr; + + for (CUmodule &M : Modules) + // Close module + if (M) + checkResult(cuModuleUnload(M), "Error returned from cuModuleUnload\n"); + + for (DeviceDataTy &D : DeviceData) { + // Destroy context + if (D.Context) + checkResult(cuCtxDestroy(D.Context), + "Error returned from cuCtxDestroy\n"); + } + } + + // Check whether a given DeviceId is valid + bool isValidDeviceId(const int DeviceId) const { + return DeviceId >= 0 && DeviceId < NumberOfDevices; + } + + bool getNumOfDevices() const { return NumberOfDevices; } + + void setRequiresFlag(const int64_t Flags) { this->RequiresFlags = Flags; } + + int initDevice(const int DeviceId) { + CUdevice Device; + + DP("Getting device %d\n", DeviceId); + CUresult Err = cuDeviceGet(&Device, DeviceId); + if (!checkResult(Err, "Error returned from cuDeviceGet\n")) + return OFFLOAD_FAIL; + + // Create the context and save it to use whenever this device is selected. + Err = cuCtxCreate(&DeviceData[DeviceId].Context, CU_CTX_SCHED_BLOCKING_SYNC, + Device); + if (!checkResult(Err, "Error returned from cuCtxCreate\n")) + return OFFLOAD_FAIL; + + Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // Initialize stream pool + if (!StreamManager->initializeDeviceStreamPool(DeviceId)) + return OFFLOAD_FAIL; + + // Query attributes to determine number of threads/block and blocks/grid. + int MaxGridDimX; + Err = cuDeviceGetAttribute(&MaxGridDimX, CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, + Device); + if (Err != CUDA_SUCCESS) { + DP("Error getting max grid dimension, use default value %d\n", + DeviceRTLTy::DefaultNumTeams); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::DefaultNumTeams; + } else if (MaxGridDimX <= DeviceRTLTy::HardTeamLimit) { + DP("Using %d CUDA blocks per grid\n", MaxGridDimX); + DeviceData[DeviceId].BlocksPerGrid = MaxGridDimX; + } else { + DP("Max CUDA blocks per grid %d exceeds the hard team limit %d, capping " + "at the hard limit\n", + MaxGridDimX, DeviceRTLTy::HardTeamLimit); + DeviceData[DeviceId].BlocksPerGrid = DeviceRTLTy::HardTeamLimit; + } + + // We are only exploiting threads along the x axis. 
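+    // The block dimension below is clamped with the same scheme as the grid
+    // dimension above: e.g. a reported maximum of 1024 threads per block is
+    // used as-is, while 2048 would be capped to the 1024-thread hard limit.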
+    int MaxBlockDimX;
+    Err = cuDeviceGetAttribute(&MaxBlockDimX,
+                               CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting max block dimension, use default value %d\n",
+         DeviceRTLTy::DefaultNumThreads);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::DefaultNumThreads;
+    } else if (MaxBlockDimX <= DeviceRTLTy::HardThreadLimit) {
+      DP("Using %d CUDA threads per block\n", MaxBlockDimX);
+      DeviceData[DeviceId].ThreadsPerBlock = MaxBlockDimX;
+    } else {
+      DP("Max CUDA threads per block %d exceeds the hard thread limit %d, "
+         "capping at the hard limit\n",
+         MaxBlockDimX, DeviceRTLTy::HardThreadLimit);
+      DeviceData[DeviceId].ThreadsPerBlock = DeviceRTLTy::HardThreadLimit;
+    }
+
+    // Get and set warp size
+    int WarpSize;
+    Err =
+        cuDeviceGetAttribute(&WarpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, Device);
+    if (Err != CUDA_SUCCESS) {
+      DP("Error getting warp size, assume default value 32\n");
+      DeviceData[DeviceId].WarpSize = 32;
+    } else {
+      DP("Using warp size %d\n", WarpSize);
+      DeviceData[DeviceId].WarpSize = WarpSize;
+    }
+
+    // Adjust teams to the env variables
+    if (EnvTeamLimit > 0 && DeviceData[DeviceId].BlocksPerGrid > EnvTeamLimit) {
+      DP("Capping max CUDA blocks per grid to OMP_TEAM_LIMIT=%d\n",
+         EnvTeamLimit);
+      DeviceData[DeviceId].BlocksPerGrid = EnvTeamLimit;
+    }
+
+    DP("Max number of CUDA blocks %d, threads %d & warp size %d\n",
+       DeviceData[DeviceId].BlocksPerGrid, DeviceData[DeviceId].ThreadsPerBlock,
+       DeviceData[DeviceId].WarpSize);
+
+    // Set default number of teams
+    if (EnvNumTeams > 0) {
+      DP("Default number of teams set according to environment %d\n",
+         EnvNumTeams);
+      DeviceData[DeviceId].NumTeams = EnvNumTeams;
+    } else {
+      DeviceData[DeviceId].NumTeams = DeviceRTLTy::DefaultNumTeams;
+      DP("Default number of teams set according to library's default %d\n",
+         DeviceRTLTy::DefaultNumTeams);
+    }
+
+    if (DeviceData[DeviceId].NumTeams > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Default number of teams exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      DeviceData[DeviceId].NumTeams = DeviceData[DeviceId].BlocksPerGrid;
+    }
+
+    // Set default number of threads
+    DeviceData[DeviceId].NumThreads = DeviceRTLTy::DefaultNumThreads;
+    DP("Default number of threads set according to library's default %d\n",
+       DeviceRTLTy::DefaultNumThreads);
+    if (DeviceData[DeviceId].NumThreads >
+        DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Default number of threads exceeds device limit, capping at %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      DeviceData[DeviceId].NumThreads = DeviceData[DeviceId].ThreadsPerBlock;
+    }
+
+    return OFFLOAD_SUCCESS;
+  }
+
+  __tgt_target_table *loadBinary(const int DeviceId,
+                                 const __tgt_device_image *Image) {
+    // Set the context we are using
+    CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context);
+    if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n"))
+      return nullptr;
+
+    // Clear the offload table as we are going to create a new one.
+    clearOffloadEntriesTable(DeviceId);
+
+    // Create the module and extract the function pointers.
+    CUmodule Module;
+    DP("Load data from image " DPxMOD "\n", DPxPTR(Image->ImageStart));
+    Err = cuModuleLoadDataEx(&Module, Image->ImageStart, 0, nullptr, nullptr);
+    if (!checkResult(Err, "Error returned from cuModuleLoadDataEx\n"))
+      return nullptr;
+
+    DP("CUDA module successfully loaded!\n");
+
+    Modules.push_back(Module);
+
+    // Find the symbols in the module by name.
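+    // Entries with a non-zero size are global variables and are resolved via
+    // cuModuleGetGlobal below; zero-sized entries are kernels and go through
+    // cuModuleGetFunction instead.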
+ const __tgt_offload_entry *HostBegin = Image->EntriesBegin; + const __tgt_offload_entry *HostEnd = Image->EntriesEnd; + + for (const __tgt_offload_entry *E = HostBegin; E != HostEnd; ++E) { + if (!E->addr) { + // We return nullptr when something like this happens, the host should + // have always something in the address to uniquely identify the target + // region. + DP("Invalid binary: host entry '' (size = %zd)...\n", E->size); + return nullptr; + } + + if (E->size) { + __tgt_offload_entry Entry = *E; + CUdeviceptr CUPtr; + size_t CUSize; + Err = cuModuleGetGlobal(&CUPtr, &CUSize, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + DP("Loading global '%s' (Failed)\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + if (CUSize != E->size) { + DP("Loading global '%s' - size mismatch (%zd != %zd)\n", E->name, + CUSize, E->size); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to global %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(CUPtr)); + + Entry.addr = (void *)(CUPtr); + + // Note: In the current implementation declare target variables + // can either be link or to. This means that once unified + // memory is activated via the requires directive, the variable + // can be used directly from the host in both cases. + // TODO: when variables types other than to or link are added, + // the below condition should be changed to explicitly + // check for to and link variables types: + // (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && (e->flags & + // OMP_DECLARE_TARGET_LINK || e->flags == OMP_DECLARE_TARGET_TO)) + if (RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If unified memory is present any target link or to variables + // can access host addresses directly. There is no longer a + // need for device copies. + cuMemcpyHtoD(CUPtr, E->addr, sizeof(void *)); + DP("Copy linked variable host address (" DPxMOD + ") to device address (" DPxMOD ")\n", + DPxPTR(*((void **)E->addr)), DPxPTR(CUPtr)); + } + + addOffloadEntry(DeviceId, Entry); + + continue; + } + + CUfunction Func; + Err = cuModuleGetFunction(&Func, Module, E->name); + // We keep this style here because we need the name + if (Err != CUDA_SUCCESS) { + DP("Loading '%s' (Failed)\n", E->name); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Entry point " DPxMOD " maps to %s (" DPxMOD ")\n", + DPxPTR(E - HostBegin), E->name, DPxPTR(Func)); + + // default value GENERIC (in case symbol is missing from cubin file) + int8_t ExecModeVal = ExecutionModeType::GENERIC; + std::string ExecModeNameStr(E->name); + ExecModeNameStr += "_exec_mode"; + const char *ExecModeName = ExecModeNameStr.c_str(); + + CUdeviceptr ExecModePtr; + size_t CUSize; + Err = cuModuleGetGlobal(&ExecModePtr, &CUSize, Module, ExecModeName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(int8_t)) { + DP("Loading global exec_mode '%s' - size mismatch (%zd != %zd)\n", + ExecModeName, CUSize, sizeof(int8_t)); + return nullptr; + } + + Err = cuMemcpyDtoH(&ExecModeVal, ExecModePtr, CUSize); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. 
Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zd\n", + DPxPTR(&ExecModeVal), DPxPTR(ExecModePtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + + if (ExecModeVal < 0 || ExecModeVal > 1) { + DP("Error wrong exec_mode value specified in cubin file: %d\n", + ExecModeVal); + return nullptr; + } + } else { + DP("Loading global exec_mode '%s' - symbol missing, using default " + "value GENERIC (1)\n", + ExecModeName); + CUDA_ERR_STRING(Err); + } + + KernelsList.emplace_back(Func, ExecModeVal); + + __tgt_offload_entry Entry = *E; + Entry.addr = &KernelsList.back(); + addOffloadEntry(DeviceId, Entry); + } + + // send device environment data to the device + { + omptarget_device_environmentTy DeviceEnv{0}; + +#ifdef OMPTARGET_DEBUG + if (const char *EnvStr = getenv("LIBOMPTARGET_DEVICE_RTL_DEBUG")) + DeviceEnv.debug_level = std::stoi(EnvStr); +#endif + + const char *DeviceEnvName = "omptarget_device_environment"; + CUdeviceptr DeviceEnvPtr; + size_t CUSize; + + Err = cuModuleGetGlobal(&DeviceEnvPtr, &CUSize, Module, DeviceEnvName); + if (Err == CUDA_SUCCESS) { + if (CUSize != sizeof(DeviceEnv)) { + DP("Global device_environment '%s' - size mismatch (%zu != %zu)\n", + DeviceEnvName, CUSize, sizeof(int32_t)); + CUDA_ERR_STRING(Err); + return nullptr; + } + + Err = cuMemcpyHtoD(DeviceEnvPtr, &DeviceEnv, CUSize); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. Pointers: " + "host = " DPxMOD ", device = " DPxMOD ", size = %zu\n", + DPxPTR(&DeviceEnv), DPxPTR(DeviceEnvPtr), CUSize); + CUDA_ERR_STRING(Err); + return nullptr; + } + + DP("Sending global device environment data %zu bytes\n", CUSize); + } else { + DP("Finding global device environment '%s' - symbol missing.\n", + DeviceEnvName); + DP("Continue, considering this is a device RTL which does not accept " + "environment setting.\n"); + } + } + + return getOffloadEntriesTable(DeviceId); + } + + void *dataAlloc(const int DeviceId, const int64_t Size) const { + if (Size == 0) + return nullptr; + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return nullptr; + + CUdeviceptr DevicePtr; + Err = cuMemAlloc(&DevicePtr, Size); + if (!checkResult(Err, "Error returned from cuMemAlloc\n")) + return nullptr; + + return (void *)DevicePtr; + } + + int dataSubmit(const int DeviceId, const void *TgtPtr, const void *HstPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyHtoDAsync((CUdeviceptr)TgtPtr, HstPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from host to device. 
Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + int dataRetrieve(const int DeviceId, void *HstPtr, const void *TgtPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyDtoHAsync(HstPtr, (CUdeviceptr)TgtPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to host. Pointers: host = " DPxMOD + ", device = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(HstPtr), DPxPTR(TgtPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + int dataTransfer(const int DeviceId, void *DstPtr, const void *SrcPtr, + const int64_t Size, __tgt_async_info *AsyncInfoPtr) const { + assert(AsyncInfoPtr && "AsyncInfoPtr is nullptr"); + + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + CUstream Stream = getStream(DeviceId, AsyncInfoPtr); + + Err = cuMemcpyDtoDAsync((CUdeviceptr)DstPtr, (CUdeviceptr)SrcPtr, Size, Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when copying data from device to device. Pointers: dst = " DPxMOD + ", src = " DPxMOD ", size = %" PRId64 "\n", + DPxPTR(DstPtr), DPxPTR(SrcPtr), Size); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; + } + + + int dataDelete(const int DeviceId, void *TgtPtr) const { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + Err = cuMemFree((CUdeviceptr)TgtPtr); + if (!checkResult(Err, "Error returned from cuMemFree\n")) + return OFFLOAD_FAIL; + + return OFFLOAD_SUCCESS; + } + + int runTargetTeamRegion(const int DeviceId, const void *TgtEntryPtr, + void **TgtArgs, ptrdiff_t *TgtOffsets, + const int ArgNum, const int TeamNum, + const int ThreadLimit, + const unsigned int LoopTripCount, + __tgt_async_info *AsyncInfo) const { + CUresult Err = cuCtxSetCurrent(DeviceData[DeviceId].Context); + if (!checkResult(Err, "Error returned from cuCtxSetCurrent\n")) + return OFFLOAD_FAIL; + + // All args are references. 
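+    // For example, with TgtArgs = {p0, p1} and TgtOffsets = {0, 8}, Ptrs
+    // becomes {p0, p1 + 8} and Args holds the addresses of those Ptrs slots,
+    // which is the kernelParams form cuLaunchKernel expects further down.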
+    std::vector<void *> Args(ArgNum);
+    std::vector<void *> Ptrs(ArgNum);
+
+    for (int I = 0; I < ArgNum; ++I) {
+      Ptrs[I] = (void *)((intptr_t)TgtArgs[I] + TgtOffsets[I]);
+      Args[I] = &Ptrs[I];
+    }
+
+    const KernelTy *KernelInfo =
+        reinterpret_cast<const KernelTy *>(TgtEntryPtr);
+
+    unsigned int CudaThreadsPerBlock;
+    if (ThreadLimit > 0) {
+      DP("Setting CUDA threads per block to requested %d\n", ThreadLimit);
+      CudaThreadsPerBlock = ThreadLimit;
+      // Add master warp if necessary
+      if (KernelInfo->ExecutionMode == GENERIC) {
+        DP("Adding master warp: +%d threads\n", DeviceData[DeviceId].WarpSize);
+        CudaThreadsPerBlock += DeviceData[DeviceId].WarpSize;
+      }
+    } else {
+      DP("Setting CUDA threads per block to default %d\n",
+         DeviceData[DeviceId].NumThreads);
+      CudaThreadsPerBlock = DeviceData[DeviceId].NumThreads;
+    }
+
+    if (CudaThreadsPerBlock > DeviceData[DeviceId].ThreadsPerBlock) {
+      DP("Threads per block capped at device limit %d\n",
+         DeviceData[DeviceId].ThreadsPerBlock);
+      CudaThreadsPerBlock = DeviceData[DeviceId].ThreadsPerBlock;
+    }
+
+    int KernelLimit;
+    Err = cuFuncGetAttribute(&KernelLimit,
+                             CU_FUNC_ATTRIBUTE_MAX_THREADS_PER_BLOCK,
+                             KernelInfo->Func);
+    if (Err == CUDA_SUCCESS && KernelLimit < CudaThreadsPerBlock) {
+      DP("Threads per block capped at kernel limit %d\n", KernelLimit);
+      CudaThreadsPerBlock = KernelLimit;
+    }
+
+    unsigned int CudaBlocksPerGrid;
+    if (TeamNum <= 0) {
+      if (LoopTripCount > 0 && EnvNumTeams < 0) {
+        if (KernelInfo->ExecutionMode == SPMD) {
+          // We have a combined construct, i.e. `target teams distribute
+          // parallel for [simd]`. We launch enough teams so that each thread
+          // executes one iteration of the loop; round up to the nearest
+          // integer.
+          CudaBlocksPerGrid = ((LoopTripCount - 1) / CudaThreadsPerBlock) + 1;
+        } else {
+          // If we reach this point, then we have a non-combined construct,
+          // i.e. `teams distribute` with a nested `parallel for`, and each
+          // team is assigned one iteration of the `distribute` loop. E.g.:
+          //
+          // #pragma omp target teams distribute
+          // for(...loop_tripcount...) {
+          //   #pragma omp parallel for
+          //   for(...) {}
+          // }
+          //
+          // Threads within a team will execute the iterations of the
+          // `parallel` loop.
+          CudaBlocksPerGrid = LoopTripCount;
+        }
+        DP("Using %d teams due to loop trip count %" PRIu64
+           " and number of threads per block %d\n",
+           CudaBlocksPerGrid, LoopTripCount, CudaThreadsPerBlock);
+      } else {
+        DP("Using default number of teams %d\n", DeviceData[DeviceId].NumTeams);
+        CudaBlocksPerGrid = DeviceData[DeviceId].NumTeams;
+      }
+    } else if (TeamNum > DeviceData[DeviceId].BlocksPerGrid) {
+      DP("Capping number of teams to team limit %d\n",
+         DeviceData[DeviceId].BlocksPerGrid);
+      CudaBlocksPerGrid = DeviceData[DeviceId].BlocksPerGrid;
+    } else {
+      DP("Using requested number of teams %d\n", TeamNum);
+      CudaBlocksPerGrid = TeamNum;
+    }
+
+    // Run on the device.
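+    // Worked example for the SPMD path above: LoopTripCount = 10000 with
+    // CudaThreadsPerBlock = 128 yields ((10000 - 1) / 128) + 1 = 79 blocks,
+    // i.e. 79 * 128 = 10112 threads, enough for one iteration per thread.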
+ DP("Launch kernel with %d blocks and %d threads\n", CudaBlocksPerGrid, + CudaThreadsPerBlock); + + CUstream Stream = getStream(DeviceId, AsyncInfo); + Err = cuLaunchKernel(KernelInfo->Func, CudaBlocksPerGrid, /* gridDimY */ 1, + /* gridDimZ */ 1, CudaThreadsPerBlock, + /* blockDimY */ 1, /* blockDimZ */ 1, + /* sharedMemBytes */ 0, Stream, &Args[0], nullptr); + if (!checkResult(Err, "Error returned from cuLaunchKernel\n")) + return OFFLOAD_FAIL; + + DP("Launch of entry point at " DPxMOD " successful!\n", + DPxPTR(TgtEntryPtr)); + + return OFFLOAD_SUCCESS; + } + + int synchronize(const int DeviceId, __tgt_async_info *AsyncInfoPtr) const { + CUstream Stream = reinterpret_cast(AsyncInfoPtr->Queue); + CUresult Err = cuStreamSynchronize(Stream); + if (Err != CUDA_SUCCESS) { + DP("Error when synchronizing stream. stream = " DPxMOD + ", async info ptr = " DPxMOD "\n", + DPxPTR(Stream), DPxPTR(AsyncInfoPtr)); + CUDA_ERR_STRING(Err); + return OFFLOAD_FAIL; + } + + // Once the stream is synchronized, return it to stream pool and reset + // async_info. This is to make sure the synchronization only works for its + // own tasks. + StreamManager->returnStream( + DeviceId, reinterpret_cast(AsyncInfoPtr->Queue)); + AsyncInfoPtr->Queue = nullptr; + + return OFFLOAD_SUCCESS; + } +}; + +DeviceRTLTy DeviceRTL; +} // namespace + +// Exposed library API function +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { + return elf_check_machine(image, /* EM_CUDA */ 190); +} + +int32_t __tgt_rtl_number_of_devices() { return DeviceRTL.getNumOfDevices(); } + +int64_t __tgt_rtl_init_requires(int64_t RequiresFlags) { + DP("Init requires flags to %ld\n", RequiresFlags); + DeviceRTL.setRequiresFlag(RequiresFlags); + return RequiresFlags; +} + +int32_t __tgt_rtl_init_device(int32_t device_id) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.initDevice(device_id); +} + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.loadBinary(device_id, image); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + return DeviceRTL.dataAlloc(device_id, size); +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_submit_async(device_id, tgt_ptr, hst_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return __tgt_rtl_synchronize(device_id, &async_info); +} + +int32_t __tgt_rtl_data_submit_async(int32_t device_id, void *tgt_ptr, + void *hst_ptr, int64_t size, + __tgt_async_info *async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + + return DeviceRTL.dataSubmit(device_id, tgt_ptr, hst_ptr, size, + async_info_ptr); +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + + __tgt_async_info async_info; + const int32_t rc = __tgt_rtl_data_retrieve_async(device_id, hst_ptr, tgt_ptr, + size, &async_info); + if (rc != OFFLOAD_SUCCESS) + return OFFLOAD_FAIL; + + return 
__tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_retrieve_async(int32_t device_id, void *hst_ptr,
+                                      void *tgt_ptr, int64_t size,
+                                      __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+
+  return DeviceRTL.dataRetrieve(device_id, hst_ptr, tgt_ptr, size,
+                                async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_transfer(int32_t device_id, void *dst_ptr, void *src_ptr,
+                                int64_t size) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_data_transfer_async(device_id, dst_ptr, src_ptr,
+                                                   size, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_data_transfer_async(int32_t device_id, void *dst_ptr,
+                                      void *src_ptr, int64_t size,
+                                      __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+  assert(async_info_ptr && "async_info_ptr is nullptr");
+
+  return DeviceRTL.dataTransfer(device_id, dst_ptr, src_ptr, size,
+                                async_info_ptr);
+}
+
+int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.dataDelete(device_id, tgt_ptr);
+}
+
+int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr,
+                                         void **tgt_args,
+                                         ptrdiff_t *tgt_offsets,
+                                         int32_t arg_num, int32_t team_num,
+                                         int32_t thread_limit,
+                                         uint64_t loop_tripcount) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_run_target_team_region_async(
+    int32_t device_id, void *tgt_entry_ptr, void **tgt_args,
+    ptrdiff_t *tgt_offsets, int32_t arg_num, int32_t team_num,
+    int32_t thread_limit, uint64_t loop_tripcount,
+    __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return DeviceRTL.runTargetTeamRegion(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, team_num,
+      thread_limit, loop_tripcount, async_info_ptr);
+}
+
+int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr,
+                                    void **tgt_args, ptrdiff_t *tgt_offsets,
+                                    int32_t arg_num) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  __tgt_async_info async_info;
+  const int32_t rc = __tgt_rtl_run_target_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num, &async_info);
+  if (rc != OFFLOAD_SUCCESS)
+    return OFFLOAD_FAIL;
+
+  return __tgt_rtl_synchronize(device_id, &async_info);
+}
+
+int32_t __tgt_rtl_run_target_region_async(int32_t device_id,
+                                          void *tgt_entry_ptr, void **tgt_args,
+                                          ptrdiff_t *tgt_offsets,
+                                          int32_t arg_num,
+                                          __tgt_async_info *async_info_ptr) {
+  assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid");
+
+  return __tgt_rtl_run_target_team_region_async(
+      device_id, tgt_entry_ptr, tgt_args, tgt_offsets, arg_num,
+      /* team num */ 1, /* thread_limit */ 1, /* loop_tripcount */ 0,
+      async_info_ptr);
+}
+
+int32_t __tgt_rtl_synchronize(int32_t device_id,
+                              __tgt_async_info
*async_info_ptr) { + assert(DeviceRTL.isValidDeviceId(device_id) && "device_id is invalid"); + assert(async_info_ptr && "async_info_ptr is nullptr"); + assert(async_info_ptr->Queue && "async_info_ptr->Queue is nullptr"); + + return DeviceRTL.synchronize(device_id, async_info_ptr); +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/plugins/exports b/openmp/libomptarget/plugins/exports index a4e1a3186daa5..67d689e2f3285 100644 --- a/openmp/libomptarget/plugins/exports +++ b/openmp/libomptarget/plugins/exports @@ -1,21 +1,23 @@ -VERS1.0 { - global: - __tgt_rtl_is_valid_binary; - __tgt_rtl_number_of_devices; - __tgt_rtl_init_requires; - __tgt_rtl_init_device; - __tgt_rtl_load_binary; - __tgt_rtl_data_alloc; - __tgt_rtl_data_submit; - __tgt_rtl_data_submit_async; - __tgt_rtl_data_retrieve; - __tgt_rtl_data_retrieve_async; - __tgt_rtl_data_delete; - __tgt_rtl_run_target_team_region; - __tgt_rtl_run_target_team_region_async; - __tgt_rtl_run_target_region; - __tgt_rtl_run_target_region_async; - __tgt_rtl_synchronize; - local: - *; -}; +VERS1.0 { + global: + __tgt_rtl_is_valid_binary; + __tgt_rtl_number_of_devices; + __tgt_rtl_init_requires; + __tgt_rtl_init_device; + __tgt_rtl_load_binary; + __tgt_rtl_data_alloc; + __tgt_rtl_data_submit; + __tgt_rtl_data_submit_async; + __tgt_rtl_data_retrieve; + __tgt_rtl_data_retrieve_async; + __tgt_rtl_data_transfer; + __tgt_rtl_data_transfer_async; + __tgt_rtl_data_delete; + __tgt_rtl_run_target_team_region; + __tgt_rtl_run_target_team_region_async; + __tgt_rtl_run_target_region; + __tgt_rtl_run_target_region_async; + __tgt_rtl_synchronize; + local: + *; +}; diff --git a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp index 8a6e085d3f75c..a2d0de38c74fe 100644 --- a/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins/generic-elf-64bit/src/rtl.cpp @@ -1,343 +1,349 @@ -//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// RTL for generic 64-bit machine -// -//===----------------------------------------------------------------------===// - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "omptargetplugin.h" - -#ifndef TARGET_NAME -#define TARGET_NAME Generic ELF - 64bit -#endif - -#ifndef TARGET_ELF_ID -#define TARGET_ELF_ID 0 -#endif - -#ifdef OMPTARGET_DEBUG -static int DebugLevel = 0; - -#define GETNAME2(name) #name -#define GETNAME(name) GETNAME2(name) -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#endif // OMPTARGET_DEBUG - -#include "../../common/elf_common.c" - -#define NUMBER_OF_DEVICES 4 -#define OFFLOADSECTIONNAME "omp_offloading_entries" - -/// Array of Dynamic libraries loaded for this target. -struct DynLibTy { - char *FileName; - void *Handle; -}; - -/// Keep entries table per device. -struct FuncOrGblEntryTy { - __tgt_target_table Table; -}; - -/// Class containing all the device information. 
-class RTLDeviceInfoTy { - std::vector> FuncGblEntries; - -public: - std::list DynLibs; - - // Record entry point associated with device. - void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, - __tgt_offload_entry *end) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncGblEntries[device_id].emplace_back(); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - E.Table.EntriesBegin = begin; - E.Table.EntriesEnd = end; - } - - // Return true if the entry is associated with device. - bool findOffloadEntry(int32_t device_id, void *addr) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; - i < e; ++i) { - if (i->addr == addr) - return true; - } - - return false; - } - - // Return the pointer to the target entries table. - __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { - assert(device_id < (int32_t)FuncGblEntries.size() && - "Unexpected device id!"); - FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); - - return &E.Table; - } - - RTLDeviceInfoTy(int32_t num_devices) { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG - - FuncGblEntries.resize(num_devices); - } - - ~RTLDeviceInfoTy() { - // Close dynamic libraries - for (auto &lib : DynLibs) { - if (lib.Handle) { - dlclose(lib.Handle); - remove(lib.FileName); - } - } - } -}; - -static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); - -#ifdef __cplusplus -extern "C" { -#endif - -int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { -// If we don't have a valid ELF ID we can just fail. -#if TARGET_ELF_ID < 1 - return 0; -#else - return elf_check_machine(image, TARGET_ELF_ID); -#endif -} - -int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } - -int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } - -__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, - __tgt_device_image *image) { - - DP("Dev %d: load binary from " DPxMOD " image\n", device_id, - DPxPTR(image->ImageStart)); - - assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); - - size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; - size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); - DP("Expecting to have %zd entries defined.\n", NumEntries); - - // Is the library version incompatible with the header file? 
- if (elf_version(EV_CURRENT) == EV_NONE) { - DP("Incompatible ELF library!\n"); - return NULL; - } - - // Obtain elf handler - Elf *e = elf_memory((char *)image->ImageStart, ImageSize); - if (!e) { - DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); - return NULL; - } - - if (elf_kind(e) != ELF_K_ELF) { - DP("Invalid Elf kind!\n"); - elf_end(e); - return NULL; - } - - // Find the entries section offset - Elf_Scn *section = 0; - Elf64_Off entries_offset = 0; - - size_t shstrndx; - - if (elf_getshdrstrndx(e, &shstrndx)) { - DP("Unable to get ELF strings index!\n"); - elf_end(e); - return NULL; - } - - while ((section = elf_nextscn(e, section))) { - GElf_Shdr hdr; - gelf_getshdr(section, &hdr); - - if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { - entries_offset = hdr.sh_addr; - break; - } - } - - if (!entries_offset) { - DP("Entries Section Offset Not Found\n"); - elf_end(e); - return NULL; - } - - DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); - - // load dynamic library and get the entry points. We use the dl library - // to do the loading of the library, but we could do it directly to avoid the - // dump to the temporary file. - // - // 1) Create tmp file with the library contents. - // 2) Use dlopen to load the file and dlsym to retrieve the symbols. - char tmp_name[] = "/tmp/tmpfile_XXXXXX"; - int tmp_fd = mkstemp(tmp_name); - - if (tmp_fd == -1) { - elf_end(e); - return NULL; - } - - FILE *ftmp = fdopen(tmp_fd, "wb"); - - if (!ftmp) { - elf_end(e); - return NULL; - } - - fwrite(image->ImageStart, ImageSize, 1, ftmp); - fclose(ftmp); - - DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)}; - - if (!Lib.Handle) { - DP("Target library loading error: %s\n", dlerror()); - elf_end(e); - return NULL; - } - - DeviceInfo.DynLibs.push_back(Lib); - - struct link_map *libInfo = (struct link_map *)Lib.Handle; - - // The place where the entries info is loaded is the library base address - // plus the offset determined from the ELF file. - Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; - - DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", - DPxPTR(entries_addr)); - - // Table of pointers to all the entries in the target. 
- __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; - - __tgt_offload_entry *entries_begin = &entries_table[0]; - __tgt_offload_entry *entries_end = entries_begin + NumEntries; - - if (!entries_begin) { - DP("Can't obtain entries begin\n"); - elf_end(e); - return NULL; - } - - DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", - DPxPTR(entries_begin), DPxPTR(entries_end)); - DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); - - elf_end(e); - - return DeviceInfo.getOffloadEntriesTable(device_id); -} - -void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { - void *ptr = malloc(size); - return ptr; -} - -int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, - int64_t size) { - memcpy(tgt_ptr, hst_ptr, size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, - int64_t size) { - memcpy(hst_ptr, tgt_ptr, size); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { - free(tgt_ptr); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, - ptrdiff_t *tgt_offsets, - int32_t arg_num, int32_t team_num, - int32_t thread_limit, - uint64_t loop_tripcount /*not used*/) { - // ignore team num and thread limit. - - // Use libffi to launch execution. - ffi_cif cif; - - // All args are references. - std::vector args_types(arg_num, &ffi_type_pointer); - std::vector args(arg_num); - std::vector ptrs(arg_num); - - for (int32_t i = 0; i < arg_num; ++i) { - ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); - args[i] = &ptrs[i]; - } - - ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, - &ffi_type_void, &args_types[0]); - - assert(status == FFI_OK && "Unable to prepare target launch!"); - - if (status != FFI_OK) - return OFFLOAD_FAIL; - - DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); - - void (*entry)(void); - *((void**) &entry) = tgt_entry_ptr; - ffi_call(&cif, entry, NULL, &args[0]); - return OFFLOAD_SUCCESS; -} - -int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, - void **tgt_args, ptrdiff_t *tgt_offsets, - int32_t arg_num) { - // use one team and one thread. - return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, - tgt_offsets, arg_num, 1, 1, 0); -} - -#ifdef __cplusplus -} -#endif +//===-RTLs/generic-64bit/src/rtl.cpp - Target RTLs Implementation - C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// RTL for generic 64-bit machine +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "omptargetplugin.h" + +#ifndef TARGET_NAME +#define TARGET_NAME Generic ELF - 64bit +#endif + +#ifndef TARGET_ELF_ID +#define TARGET_ELF_ID 0 +#endif + +#ifdef OMPTARGET_DEBUG +static int DebugLevel = 0; + +#define GETNAME2(name) #name +#define GETNAME(name) GETNAME2(name) +#define DP(...) 
\ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Target " GETNAME(TARGET_NAME) " RTL", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#include "../../common/elf_common.c" + +#define NUMBER_OF_DEVICES 4 +#define OFFLOADSECTIONNAME "omp_offloading_entries" + +/// Array of Dynamic libraries loaded for this target. +struct DynLibTy { + char *FileName; + void *Handle; +}; + +/// Keep entries table per device. +struct FuncOrGblEntryTy { + __tgt_target_table Table; +}; + +/// Class containing all the device information. +class RTLDeviceInfoTy { + std::vector> FuncGblEntries; + +public: + std::list DynLibs; + + // Record entry point associated with device. + void createOffloadTable(int32_t device_id, __tgt_offload_entry *begin, + __tgt_offload_entry *end) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncGblEntries[device_id].emplace_back(); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + E.Table.EntriesBegin = begin; + E.Table.EntriesEnd = end; + } + + // Return true if the entry is associated with device. + bool findOffloadEntry(int32_t device_id, void *addr) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + for (__tgt_offload_entry *i = E.Table.EntriesBegin, *e = E.Table.EntriesEnd; + i < e; ++i) { + if (i->addr == addr) + return true; + } + + return false; + } + + // Return the pointer to the target entries table. + __tgt_target_table *getOffloadEntriesTable(int32_t device_id) { + assert(device_id < (int32_t)FuncGblEntries.size() && + "Unexpected device id!"); + FuncOrGblEntryTy &E = FuncGblEntries[device_id].back(); + + return &E.Table; + } + + RTLDeviceInfoTy(int32_t num_devices) { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + FuncGblEntries.resize(num_devices); + } + + ~RTLDeviceInfoTy() { + // Close dynamic libraries + for (auto &lib : DynLibs) { + if (lib.Handle) { + dlclose(lib.Handle); + remove(lib.FileName); + } + } + } +}; + +static RTLDeviceInfoTy DeviceInfo(NUMBER_OF_DEVICES); + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t __tgt_rtl_is_valid_binary(__tgt_device_image *image) { +// If we don't have a valid ELF ID we can just fail. +#if TARGET_ELF_ID < 1 + return 0; +#else + return elf_check_machine(image, TARGET_ELF_ID); +#endif +} + +int32_t __tgt_rtl_number_of_devices() { return NUMBER_OF_DEVICES; } + +int32_t __tgt_rtl_init_device(int32_t device_id) { return OFFLOAD_SUCCESS; } + +__tgt_target_table *__tgt_rtl_load_binary(int32_t device_id, + __tgt_device_image *image) { + + DP("Dev %d: load binary from " DPxMOD " image\n", device_id, + DPxPTR(image->ImageStart)); + + assert(device_id >= 0 && device_id < NUMBER_OF_DEVICES && "bad dev id"); + + size_t ImageSize = (size_t)image->ImageEnd - (size_t)image->ImageStart; + size_t NumEntries = (size_t)(image->EntriesEnd - image->EntriesBegin); + DP("Expecting to have %zd entries defined.\n", NumEntries); + + // Is the library version incompatible with the header file? 
+ if (elf_version(EV_CURRENT) == EV_NONE) { + DP("Incompatible ELF library!\n"); + return NULL; + } + + // Obtain elf handler + Elf *e = elf_memory((char *)image->ImageStart, ImageSize); + if (!e) { + DP("Unable to get ELF handle: %s!\n", elf_errmsg(-1)); + return NULL; + } + + if (elf_kind(e) != ELF_K_ELF) { + DP("Invalid Elf kind!\n"); + elf_end(e); + return NULL; + } + + // Find the entries section offset + Elf_Scn *section = 0; + Elf64_Off entries_offset = 0; + + size_t shstrndx; + + if (elf_getshdrstrndx(e, &shstrndx)) { + DP("Unable to get ELF strings index!\n"); + elf_end(e); + return NULL; + } + + while ((section = elf_nextscn(e, section))) { + GElf_Shdr hdr; + gelf_getshdr(section, &hdr); + + if (!strcmp(elf_strptr(e, shstrndx, hdr.sh_name), OFFLOADSECTIONNAME)) { + entries_offset = hdr.sh_addr; + break; + } + } + + if (!entries_offset) { + DP("Entries Section Offset Not Found\n"); + elf_end(e); + return NULL; + } + + DP("Offset of entries section is (" DPxMOD ").\n", DPxPTR(entries_offset)); + + // load dynamic library and get the entry points. We use the dl library + // to do the loading of the library, but we could do it directly to avoid the + // dump to the temporary file. + // + // 1) Create tmp file with the library contents. + // 2) Use dlopen to load the file and dlsym to retrieve the symbols. + char tmp_name[] = "/tmp/tmpfile_XXXXXX"; + int tmp_fd = mkstemp(tmp_name); + + if (tmp_fd == -1) { + elf_end(e); + return NULL; + } + + FILE *ftmp = fdopen(tmp_fd, "wb"); + + if (!ftmp) { + elf_end(e); + return NULL; + } + + fwrite(image->ImageStart, ImageSize, 1, ftmp); + fclose(ftmp); + + DynLibTy Lib = {tmp_name, dlopen(tmp_name, RTLD_LAZY)}; + + if (!Lib.Handle) { + DP("Target library loading error: %s\n", dlerror()); + elf_end(e); + return NULL; + } + + DeviceInfo.DynLibs.push_back(Lib); + + struct link_map *libInfo = (struct link_map *)Lib.Handle; + + // The place where the entries info is loaded is the library base address + // plus the offset determined from the ELF file. + Elf64_Addr entries_addr = libInfo->l_addr + entries_offset; + + DP("Pointer to first entry to be loaded is (" DPxMOD ").\n", + DPxPTR(entries_addr)); + + // Table of pointers to all the entries in the target. 
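+  // That is, entries_addr = l_addr of the dlopen'ed library plus the
+  // section's sh_addr, so the NumEntries-long table can be read directly in
+  // the host process.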
+ __tgt_offload_entry *entries_table = (__tgt_offload_entry *)entries_addr; + + __tgt_offload_entry *entries_begin = &entries_table[0]; + __tgt_offload_entry *entries_end = entries_begin + NumEntries; + + if (!entries_begin) { + DP("Can't obtain entries begin\n"); + elf_end(e); + return NULL; + } + + DP("Entries table range is (" DPxMOD ")->(" DPxMOD ")\n", + DPxPTR(entries_begin), DPxPTR(entries_end)); + DeviceInfo.createOffloadTable(device_id, entries_begin, entries_end); + + elf_end(e); + + return DeviceInfo.getOffloadEntriesTable(device_id); +} + +void *__tgt_rtl_data_alloc(int32_t device_id, int64_t size, void *hst_ptr) { + void *ptr = malloc(size); + return ptr; +} + +int32_t __tgt_rtl_data_submit(int32_t device_id, void *tgt_ptr, void *hst_ptr, + int64_t size) { + memcpy(tgt_ptr, hst_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_retrieve(int32_t device_id, void *hst_ptr, void *tgt_ptr, + int64_t size) { + memcpy(hst_ptr, tgt_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_transfer(int32_t device_id, void *dst_ptr, void *src_ptr, + int64_t size) { + memcpy(dst_ptr, src_ptr, size); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_data_delete(int32_t device_id, void *tgt_ptr) { + free(tgt_ptr); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_team_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, + ptrdiff_t *tgt_offsets, + int32_t arg_num, int32_t team_num, + int32_t thread_limit, + uint64_t loop_tripcount /*not used*/) { + // ignore team num and thread limit. + + // Use libffi to launch execution. + ffi_cif cif; + + // All args are references. + std::vector args_types(arg_num, &ffi_type_pointer); + std::vector args(arg_num); + std::vector ptrs(arg_num); + + for (int32_t i = 0; i < arg_num; ++i) { + ptrs[i] = (void *)((intptr_t)tgt_args[i] + tgt_offsets[i]); + args[i] = &ptrs[i]; + } + + ffi_status status = ffi_prep_cif(&cif, FFI_DEFAULT_ABI, arg_num, + &ffi_type_void, &args_types[0]); + + assert(status == FFI_OK && "Unable to prepare target launch!"); + + if (status != FFI_OK) + return OFFLOAD_FAIL; + + DP("Running entry point at " DPxMOD "...\n", DPxPTR(tgt_entry_ptr)); + + void (*entry)(void); + *((void**) &entry) = tgt_entry_ptr; + ffi_call(&cif, entry, NULL, &args[0]); + return OFFLOAD_SUCCESS; +} + +int32_t __tgt_rtl_run_target_region(int32_t device_id, void *tgt_entry_ptr, + void **tgt_args, ptrdiff_t *tgt_offsets, + int32_t arg_num) { + // use one team and one thread. + return __tgt_rtl_run_target_team_region(device_id, tgt_entry_ptr, tgt_args, + tgt_offsets, arg_num, 1, 1, 0); +} + +#ifdef __cplusplus +} +#endif diff --git a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt index 3915196453e0a..ffa684732ba28 100644 --- a/openmp/libomptarget/plugins/ppc64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/ppc64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64 machine if available. 
-# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64" "PPC64" "ppc64" "powerpc64-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64 offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt index 0cfe7c0051fa4..e5d8cffe4aad7 100644 --- a/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt +++ b/openmp/libomptarget/plugins/ppc64le/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a ppc64le machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") -else() - libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a ppc64le machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("ppc64le" "PPC64le" "ppc64" "powerpc64le-ibm-linux-gnu" "21") +else() + libomptarget_say("Not building ppc64le offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt index f61e1e856c80d..33e9bb373cb01 100644 --- a/openmp/libomptarget/plugins/x86_64/CMakeLists.txt +++ b/openmp/libomptarget/plugins/x86_64/CMakeLists.txt @@ -1,17 +1,17 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build a plugin for a x86_64 machine if available. -# -##===----------------------------------------------------------------------===## - -if(CMAKE_SYSTEM_NAME MATCHES "Linux") - build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") -else() - libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build a plugin for a x86_64 machine if available. +# +##===----------------------------------------------------------------------===## + +if(CMAKE_SYSTEM_NAME MATCHES "Linux") + build_generic_elf64("x86_64" "x86_64" "x86_64" "x86_64-pc-linux-gnu" "62") +else() + libomptarget_say("Not building x86_64 offloading plugin: machine not found in the system.") endif() \ No newline at end of file diff --git a/openmp/libomptarget/src/CMakeLists.txt b/openmp/libomptarget/src/CMakeLists.txt index f30087ed43423..e534619ff5100 100644 --- a/openmp/libomptarget/src/CMakeLists.txt +++ b/openmp/libomptarget/src/CMakeLists.txt @@ -1,31 +1,31 @@ -##===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -##===----------------------------------------------------------------------===## -# -# Build offloading library libomptarget.so. -# -##===----------------------------------------------------------------------===## - -libomptarget_say("Building offloading runtime library libomptarget.") - -set(src_files - api.cpp - device.cpp - interface.cpp - rtl.cpp - omptarget.cpp -) - -# Build libomptarget library with libdl dependency. -add_library(omptarget SHARED ${src_files}) -target_link_libraries(omptarget - ${CMAKE_DL_LIBS} - "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") - -# Install libomptarget under the lib destination folder. -install(TARGETS omptarget LIBRARY COMPONENT omptarget - DESTINATION "${OPENMP_INSTALL_LIBDIR}") +##===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +##===----------------------------------------------------------------------===## +# +# Build offloading library libomptarget.so. +# +##===----------------------------------------------------------------------===## + +libomptarget_say("Building offloading runtime library libomptarget.") + +set(src_files + api.cpp + device.cpp + interface.cpp + rtl.cpp + omptarget.cpp +) + +# Build libomptarget library with libdl dependency. +add_library(omptarget SHARED ${src_files}) +target_link_libraries(omptarget + ${CMAKE_DL_LIBS} + "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/exports") + +# Install libomptarget under the lib destination folder. 
+install(TARGETS omptarget LIBRARY COMPONENT omptarget + DESTINATION "${OPENMP_INSTALL_LIBDIR}") diff --git a/openmp/libomptarget/src/api.cpp b/openmp/libomptarget/src/api.cpp index 3c7b709fb894e..4d6b1d185b147 100644 --- a/openmp/libomptarget/src/api.cpp +++ b/openmp/libomptarget/src/api.cpp @@ -1,291 +1,295 @@ -//===----------- api.cpp - Target independent OpenMP target RTL -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of OpenMP API interface functions. -// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include - -EXTERN int omp_get_num_devices(void) { - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - - DP("Call to omp_get_num_devices returning %zd\n", Devices_size); - - return Devices_size; -} - -EXTERN int omp_get_initial_device(void) { - DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); - return HOST_DEVICE; -} - -EXTERN void *omp_target_alloc(size_t size, int device_num) { - DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", - device_num, size); - - if (size <= 0) { - DP("Call to omp_target_alloc with non-positive length\n"); - return NULL; - } - - void *rc = NULL; - - if (device_num == omp_get_initial_device()) { - rc = malloc(size); - DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_alloc returns NULL ptr\n"); - return NULL; - } - - DeviceTy &Device = Devices[device_num]; - rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); - DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); - return rc; -} - -EXTERN void omp_target_free(void *device_ptr, int device_num) { - DP("Call to omp_target_free for device %d and address " DPxMOD "\n", - device_num, DPxPTR(device_ptr)); - - if (!device_ptr) { - DP("Call to omp_target_free with NULL ptr\n"); - return; - } - - if (device_num == omp_get_initial_device()) { - free(device_ptr); - DP("omp_target_free deallocated host ptr\n"); - return; - } - - if (!device_is_ready(device_num)) { - DP("omp_target_free returns, nothing to do\n"); - return; - } - - DeviceTy &Device = Devices[device_num]; - Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); - DP("omp_target_free deallocated device ptr\n"); -} - -EXTERN int omp_target_is_present(void *ptr, int device_num) { - DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", - device_num, DPxPTR(ptr)); - - if (!ptr) { - DP("Call to omp_target_is_present with NULL ptr, returning false\n"); - return false; - } - - if (device_num == omp_get_initial_device()) { - DP("Call to omp_target_is_present on host, returning true\n"); - return true; - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Call to omp_target_is_present with invalid device ID, returning " - "false\n"); - return false; - } - - DeviceTy& Device = Devices[device_num]; - bool IsLast; // not used - bool IsHostPtr; - void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); - int rc = (TgtPtr != NULL); - // Under unified memory the 
host pointer can be returned by the - // getTgtPtrBegin() function which means that there is no device - // corresponding point for ptr. This function should return false - // in that situation. - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) - rc = !IsHostPtr; - DP("Call to omp_target_is_present returns %d\n", rc); - return rc; -} - -EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, - size_t dst_offset, size_t src_offset, int dst_device, int src_device) { - DP("Call to omp_target_memcpy, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " - "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), - DPxPTR(src), dst_offset, src_offset, length); - - if (!dst || !src || length <= 0) { - DP("Call to omp_target_memcpy with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { - DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); - return OFFLOAD_FAIL; - } - - int rc = OFFLOAD_SUCCESS; - void *srcAddr = (char *)src + src_offset; - void *dstAddr = (char *)dst + dst_offset; - - if (src_device == omp_get_initial_device() && - dst_device == omp_get_initial_device()) { - DP("copy from host to host\n"); - const void *p = memcpy(dstAddr, srcAddr, length); - if (p == NULL) - rc = OFFLOAD_FAIL; - } else if (src_device == omp_get_initial_device()) { - DP("copy from host to device\n"); - DeviceTy& DstDev = Devices[dst_device]; - rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); - } else if (dst_device == omp_get_initial_device()) { - DP("copy from device to host\n"); - DeviceTy& SrcDev = Devices[src_device]; - rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); - } else { - DP("copy from device to device\n"); - void *buffer = malloc(length); - DeviceTy& SrcDev = Devices[src_device]; - DeviceTy& DstDev = Devices[dst_device]; - rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); - if (rc == OFFLOAD_SUCCESS) - rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); - free(buffer); - } - - DP("omp_target_memcpy returns %d\n", rc); - return rc; -} - -EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, - int num_dims, const size_t *volume, const size_t *dst_offsets, - const size_t *src_offsets, const size_t *dst_dimensions, - const size_t *src_dimensions, int dst_device, int src_device) { - DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " - "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " - "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " - "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, - src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), - DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), - DPxPTR(volume), element_size, num_dims); - - if (!(dst || src)) { - DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", - INT_MAX); - return INT_MAX; - } - - if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || - !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { - DP("Call to omp_target_memcpy_rect with invalid arguments\n"); - return OFFLOAD_FAIL; - } - - int rc; - if (num_dims == 1) { - rc = omp_target_memcpy(dst, src, element_size * volume[0], - element_size * dst_offsets[0], element_size * 
src_offsets[0], - dst_device, src_device); - } else { - size_t dst_slice_size = element_size; - size_t src_slice_size = element_size; - for (int i=1; i + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +EXTERN int omp_get_num_devices(void) { + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + + DP("Call to omp_get_num_devices returning %zd\n", Devices_size); + + return Devices_size; +} + +EXTERN int omp_get_initial_device(void) { + DP("Call to omp_get_initial_device returning %d\n", HOST_DEVICE); + return HOST_DEVICE; +} + +EXTERN void *omp_target_alloc(size_t size, int device_num) { + DP("Call to omp_target_alloc for device %d requesting %zu bytes\n", + device_num, size); + + if (size <= 0) { + DP("Call to omp_target_alloc with non-positive length\n"); + return NULL; + } + + void *rc = NULL; + + if (device_num == omp_get_initial_device()) { + rc = malloc(size); + DP("omp_target_alloc returns host ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_alloc returns NULL ptr\n"); + return NULL; + } + + DeviceTy &Device = Devices[device_num]; + rc = Device.RTL->data_alloc(Device.RTLDeviceID, size, NULL); + DP("omp_target_alloc returns device ptr " DPxMOD "\n", DPxPTR(rc)); + return rc; +} + +EXTERN void omp_target_free(void *device_ptr, int device_num) { + DP("Call to omp_target_free for device %d and address " DPxMOD "\n", + device_num, DPxPTR(device_ptr)); + + if (!device_ptr) { + DP("Call to omp_target_free with NULL ptr\n"); + return; + } + + if (device_num == omp_get_initial_device()) { + free(device_ptr); + DP("omp_target_free deallocated host ptr\n"); + return; + } + + if (!device_is_ready(device_num)) { + DP("omp_target_free returns, nothing to do\n"); + return; + } + + DeviceTy &Device = Devices[device_num]; + Device.RTL->data_delete(Device.RTLDeviceID, (void *)device_ptr); + DP("omp_target_free deallocated device ptr\n"); +} + +EXTERN int omp_target_is_present(void *ptr, int device_num) { + DP("Call to omp_target_is_present for device %d and address " DPxMOD "\n", + device_num, DPxPTR(ptr)); + + if (!ptr) { + DP("Call to omp_target_is_present with NULL ptr, returning false\n"); + return false; + } + + if (device_num == omp_get_initial_device()) { + DP("Call to omp_target_is_present on host, returning true\n"); + return true; + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Call to omp_target_is_present with invalid device ID, returning " + "false\n"); + return false; + } + + DeviceTy& Device = Devices[device_num]; + bool IsLast; // not used + bool IsHostPtr; + void *TgtPtr = Device.getTgtPtrBegin(ptr, 0, IsLast, false, IsHostPtr); + int rc = (TgtPtr != NULL); + // Under unified memory the host pointer can be returned by the + // getTgtPtrBegin() function which means that there is no device + // corresponding point for ptr. This function should return false + // in that situation. 
+ if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) + rc = !IsHostPtr; + DP("Call to omp_target_is_present returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy(void *dst, void *src, size_t length, + size_t dst_offset, size_t src_offset, int dst_device, int src_device) { + DP("Call to omp_target_memcpy, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offset %zu, " + "src offset %zu, length %zu\n", dst_device, src_device, DPxPTR(dst), + DPxPTR(src), dst_offset, src_offset, length); + + if (!dst || !src || length <= 0) { + DP("Call to omp_target_memcpy with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + if (src_device != omp_get_initial_device() && !device_is_ready(src_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + if (dst_device != omp_get_initial_device() && !device_is_ready(dst_device)) { + DP("omp_target_memcpy returns OFFLOAD_FAIL\n"); + return OFFLOAD_FAIL; + } + + int rc = OFFLOAD_SUCCESS; + void *srcAddr = (char *)src + src_offset; + void *dstAddr = (char *)dst + dst_offset; + + if (src_device == omp_get_initial_device() && + dst_device == omp_get_initial_device()) { + DP("copy from host to host\n"); + const void *p = memcpy(dstAddr, srcAddr, length); + if (p == NULL) + rc = OFFLOAD_FAIL; + } else if (src_device == omp_get_initial_device()) { + DP("copy from host to device\n"); + DeviceTy& DstDev = Devices[dst_device]; + rc = DstDev.data_submit(dstAddr, srcAddr, length, nullptr); + } else if (dst_device == omp_get_initial_device()) { + DP("copy from device to host\n"); + DeviceTy& SrcDev = Devices[src_device]; + rc = SrcDev.data_retrieve(dstAddr, srcAddr, length, nullptr); + } else { + DP("copy from device to device\n"); + DeviceTy& SrcDev = Devices[src_device]; + DeviceTy& DstDev = Devices[dst_device]; + if (SrcDev.RTL->RTLName != DstDev.RTL->RTLName) { + void *buffer = malloc(length); + rc = SrcDev.data_retrieve(buffer, srcAddr, length, nullptr); + if (rc == OFFLOAD_SUCCESS) + rc = DstDev.data_submit(dstAddr, buffer, length, nullptr); + free(buffer); + } else { + rc = SrcDev.data_transfer(dstAddr, srcAddr, length, nullptr); + } + } + + DP("omp_target_memcpy returns %d\n", rc); + return rc; +} + +EXTERN int omp_target_memcpy_rect(void *dst, void *src, size_t element_size, + int num_dims, const size_t *volume, const size_t *dst_offsets, + const size_t *src_offsets, const size_t *dst_dimensions, + const size_t *src_dimensions, int dst_device, int src_device) { + DP("Call to omp_target_memcpy_rect, dst device %d, src device %d, " + "dst addr " DPxMOD ", src addr " DPxMOD ", dst offsets " DPxMOD ", " + "src offsets " DPxMOD ", dst dims " DPxMOD ", src dims " DPxMOD ", " + "volume " DPxMOD ", element size %zu, num_dims %d\n", dst_device, + src_device, DPxPTR(dst), DPxPTR(src), DPxPTR(dst_offsets), + DPxPTR(src_offsets), DPxPTR(dst_dimensions), DPxPTR(src_dimensions), + DPxPTR(volume), element_size, num_dims); + + if (!(dst || src)) { + DP("Call to omp_target_memcpy_rect returns max supported dimensions %d\n", + INT_MAX); + return INT_MAX; + } + + if (!dst || !src || element_size < 1 || num_dims < 1 || !volume || + !dst_offsets || !src_offsets || !dst_dimensions || !src_dimensions) { + DP("Call to omp_target_memcpy_rect with invalid arguments\n"); + return OFFLOAD_FAIL; + } + + int rc; + if (num_dims == 1) { + rc = omp_target_memcpy(dst, src, element_size * volume[0], + element_size * dst_offsets[0], element_size * src_offsets[0], + dst_device, src_device); + } else { + size_t 
dst_slice_size = element_size; + size_t src_slice_size = element_size; + for (int i=1; i -#include -#include - -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. -DevicesTy Devices; - -int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { - DataMapMtx.lock(); - - // Check if entry exists - for (auto &HT : HostDataToTargetMap) { - if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) { - // Mapping already exists - bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin && - HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size && - HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin; - DataMapMtx.unlock(); - if (isValid) { - DP("Attempt to re-associate the same device ptr+offset with the same " - "host ptr, nothing to do\n"); - return OFFLOAD_SUCCESS; - } else { - DP("Not allowed to re-associate a different device ptr+offset with the " - "same host ptr\n"); - return OFFLOAD_FAIL; - } - } - } - - // Mapping does not exist, allocate it with refCount=INF - HostDataToTargetTy newEntry((uintptr_t) HstPtrBegin /*HstPtrBase*/, - (uintptr_t) HstPtrBegin /*HstPtrBegin*/, - (uintptr_t) HstPtrBegin + Size /*HstPtrEnd*/, - (uintptr_t) TgtPtrBegin /*TgtPtrBegin*/, - true /*IsRefCountINF*/); - - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd=" - DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase), - DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd), - DPxPTR(newEntry.TgtPtrBegin)); - HostDataToTargetMap.push_front(newEntry); - - DataMapMtx.unlock(); - - return OFFLOAD_SUCCESS; -} - -int DeviceTy::disassociatePtr(void *HstPtrBegin) { - DataMapMtx.lock(); - - // Check if entry exists - for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin(); - ii != HostDataToTargetMap.end(); ++ii) { - if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) { - // Mapping exists - if (ii->isRefCountInf()) { - DP("Association found, removing it\n"); - HostDataToTargetMap.erase(ii); - DataMapMtx.unlock(); - return OFFLOAD_SUCCESS; - } else { - DP("Trying to disassociate a pointer which was not mapped via " - "omp_target_associate_ptr\n"); - break; - } - } - } - - // Mapping not found - DataMapMtx.unlock(); - DP("Association not found\n"); - return OFFLOAD_FAIL; -} - -// Get ref count of map entry containing HstPtrBegin -uint64_t DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - uint64_t RefCnt = 0; - - DataMapMtx.lock(); - for (auto &HT : HostDataToTargetMap) { - if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { - DP("DeviceTy::getMapEntry: requested entry found\n"); - RefCnt = HT.getRefCount(); - break; - } - } - DataMapMtx.unlock(); - - if (RefCnt == 0) { - DP("DeviceTy::getMapEntry: requested entry not found\n"); - } - - return RefCnt; -} - -LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - LookupResult lr; - - DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), - Size); - for (lr.Entry = HostDataToTargetMap.begin(); - lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { - auto &HT = *lr.Entry; - // Is it contained? - lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && - (hp+Size) <= HT.HstPtrEnd; - // Does it extend into an already mapped region? - lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; - // Does it extend beyond the mapped region? 
- lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; - - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || - lr.Flags.ExtendsAfter) { - break; - } - } - - if (lr.Flags.ExtendsBefore) { - DP("WARNING: Pointer is not mapped but section extends into already " - "mapped data\n"); - } - if (lr.Flags.ExtendsAfter) { - DP("WARNING: Pointer is already mapped but section extends beyond mapped " - "region\n"); - } - - return lr; -} - -// Used by target_data_begin -// Return the target pointer begin (where the data will be moved). -// Allocate memory if this is the first occurrence of this mapping. -// Increment the reference counter. -// If NULL is returned, then either data allocation failed or the user tried -// to do an illegal mapping. -void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, - int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, - bool UpdateRefCount, bool HasCloseModifier) { - void *rc = NULL; - IsHostPtr = false; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - - // Check if the pointer is contained. - // If a variable is mapped to the device manually by the user - which would - // lead to the IsContained flag to be true - then we must ensure that the - // device address is returned even under unified memory conditions. - if (lr.Flags.IsContained || - ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { - auto &HT = *lr.Entry; - IsNew = false; - - if (UpdateRefCount) - HT.incRefCount(); - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), - DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); - rc = (void *)tp; - } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { - // Explicit extension of mapped data - not allowed. - DP("Explicit extension of mapping is not allowed.\n"); - } else if (Size) { - // If unified shared memory is active, implicitly mapped variables that are not - // privatized use host address. Any explicitly mapped variables also use - // host address where correctness is not impeded. In all other cases - // maps are respected. - // In addition to the mapping rules above, the close map - // modifier forces the mapping of the variable to the device. - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - !HasCloseModifier) { - DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", - DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); - IsHostPtr = true; - rc = HstPtrBegin; - } else { - // If it is not contained and Size > 0 we should create a new entry for it. - IsNew = true; - uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); - DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " - "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), - DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); - HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, - (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); - rc = (void *)tp; - } - } - - DataMapMtx.unlock(); - return rc; -} - -// Used by target_data_begin, target_data_end, target_data_update and target. -// Return the target pointer begin (where the data will be moved). 
-// Decrement the reference counter if called from target_data_end. -void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr) { - void *rc = NULL; - IsHostPtr = false; - IsLast = false; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - IsLast = HT.getRefCount() == 1; - - if (!IsLast && UpdateRefCount) - HT.decRefCount(); - - uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); - DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " - "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, - (UpdateRefCount ? " updated" : ""), - HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); - rc = (void *)tp; - } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { - // If the value isn't found in the mapping and unified shared memory - // is on then it means we have stumbled upon a value which we need to - // use directly from the host. - DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", - DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); - IsHostPtr = true; - rc = HstPtrBegin; - } - - DataMapMtx.unlock(); - return rc; -} - -// Return the target pointer begin (where the data will be moved). -// Lock-free version called when loading global symbols from the fat binary. -void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { - uintptr_t hp = (uintptr_t)HstPtrBegin; - LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); - return (void *)tp; - } - - return NULL; -} - -int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier) { - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) - return OFFLOAD_SUCCESS; - // Check if the pointer is contained in any sub-nodes. - int rc; - DataMapMtx.lock(); - LookupResult lr = lookupMapping(HstPtrBegin, Size); - if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { - auto &HT = *lr.Entry; - if (ForceDelete) - HT.resetRefCount(); - if (HT.decRefCount() == 0) { - DP("Deleting tgt data " DPxMOD " of size %ld\n", - DPxPTR(HT.TgtPtrBegin), Size); - RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); - DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD - ", Size=%ld\n", (ForceDelete ? " (forced)" : ""), - DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); - HostDataToTargetMap.erase(lr.Entry); - } - rc = OFFLOAD_SUCCESS; - } else { - DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" - " memory\n", DPxPTR(HstPtrBegin)); - rc = OFFLOAD_FAIL; - } - - DataMapMtx.unlock(); - return rc; -} - -/// Init device, should not be called directly. -void DeviceTy::init() { - // Make call to init_requires if it exists for this plugin. - if (RTL->init_requires) - RTL->init_requires(RTLs->RequiresFlags); - int32_t rc = RTL->init_device(RTLDeviceID); - if (rc == OFFLOAD_SUCCESS) { - IsInit = true; - } -} - -/// Thread-safe method to initialize the device only once. 
-int32_t DeviceTy::initOnce() { - std::call_once(InitFlag, &DeviceTy::init, this); - - // At this point, if IsInit is true, then either this thread or some other - // thread in the past successfully initialized the device, so we can return - // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it - // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means - // that some other thread already attempted to execute init() and if IsInit - // is still false, return OFFLOAD_FAIL. - if (IsInit) - return OFFLOAD_SUCCESS; - else - return OFFLOAD_FAIL; -} - -// Load binary to device. -__tgt_target_table *DeviceTy::load_binary(void *Img) { - RTL->Mtx.lock(); - __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img); - RTL->Mtx.unlock(); - return rc; -} - -// Submit data to device -int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, - int64_t Size, __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) - return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); - else - return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, - AsyncInfoPtr); -} - -// Retrieve data from device -int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, - int64_t Size, __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) - return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); - else - return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, - AsyncInfoPtr); -} - -// Run region on device -int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) - return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, - TgtVarsSize); - else - return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, AsyncInfoPtr); -} - -// Run team region on device. -int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, - __tgt_async_info *AsyncInfoPtr) { - if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) - return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, - LoopTripCount); - else - return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, - TgtOffsets, TgtVarsSize, NumTeams, - ThreadLimit, LoopTripCount, AsyncInfoPtr); -} - -/// Check whether a device has an associated RTL and initialize it if it's not -/// already initialized. -bool device_is_ready(int device_num) { - DP("Checking whether device %d is ready.\n", device_num); - // Devices.size() can only change while registering a new - // library, so try to acquire the lock of RTLs' mutex. - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_num) { - DP("Device ID %d does not have a matching RTL\n", device_num); - return false; - } - - // Get device info - DeviceTy &Device = Devices[device_num]; - - DP("Is the device %d (local ID %d) initialized? 
%d\n", device_num, - Device.RTLDeviceID, Device.IsInit); - - // Init the device if not done before - if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { - DP("Failed to init device %d\n", device_num); - return false; - } - - DP("Device %d is ready to use.\n", device_num); - - return true; -} +//===--------- device.cpp - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for managing devices that are handled by RTL plugins. +// +//===----------------------------------------------------------------------===// + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +/// Map between Device ID (i.e. openmp device id) and its DeviceTy. +DevicesTy Devices; + +int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) { + DataMapMtx.lock(); + + // Check if entry exists + for (auto &HT : HostDataToTargetMap) { + if ((uintptr_t)HstPtrBegin == HT.HstPtrBegin) { + // Mapping already exists + bool isValid = HT.HstPtrBegin == (uintptr_t) HstPtrBegin && + HT.HstPtrEnd == (uintptr_t) HstPtrBegin + Size && + HT.TgtPtrBegin == (uintptr_t) TgtPtrBegin; + DataMapMtx.unlock(); + if (isValid) { + DP("Attempt to re-associate the same device ptr+offset with the same " + "host ptr, nothing to do\n"); + return OFFLOAD_SUCCESS; + } else { + DP("Not allowed to re-associate a different device ptr+offset with the " + "same host ptr\n"); + return OFFLOAD_FAIL; + } + } + } + + // Mapping does not exist, allocate it with refCount=INF + HostDataToTargetTy newEntry((uintptr_t) HstPtrBegin /*HstPtrBase*/, + (uintptr_t) HstPtrBegin /*HstPtrBegin*/, + (uintptr_t) HstPtrBegin + Size /*HstPtrEnd*/, + (uintptr_t) TgtPtrBegin /*TgtPtrBegin*/, + true /*IsRefCountINF*/); + + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", HstEnd=" + DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(newEntry.HstPtrBase), + DPxPTR(newEntry.HstPtrBegin), DPxPTR(newEntry.HstPtrEnd), + DPxPTR(newEntry.TgtPtrBegin)); + HostDataToTargetMap.push_front(newEntry); + + DataMapMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +int DeviceTy::disassociatePtr(void *HstPtrBegin) { + DataMapMtx.lock(); + + // Check if entry exists + for (HostDataToTargetListTy::iterator ii = HostDataToTargetMap.begin(); + ii != HostDataToTargetMap.end(); ++ii) { + if ((uintptr_t)HstPtrBegin == ii->HstPtrBegin) { + // Mapping exists + if (ii->isRefCountInf()) { + DP("Association found, removing it\n"); + HostDataToTargetMap.erase(ii); + DataMapMtx.unlock(); + return OFFLOAD_SUCCESS; + } else { + DP("Trying to disassociate a pointer which was not mapped via " + "omp_target_associate_ptr\n"); + break; + } + } + } + + // Mapping not found + DataMapMtx.unlock(); + DP("Association not found\n"); + return OFFLOAD_FAIL; +} + +// Get ref count of map entry containing HstPtrBegin +uint64_t DeviceTy::getMapEntryRefCnt(void *HstPtrBegin) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + uint64_t RefCnt = 0; + + DataMapMtx.lock(); + for (auto &HT : HostDataToTargetMap) { + if (hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd) { + DP("DeviceTy::getMapEntry: requested entry found\n"); + RefCnt = HT.getRefCount(); + break; + } + } + DataMapMtx.unlock(); + + if (RefCnt == 0) { + DP("DeviceTy::getMapEntry: requested entry 
not found\n"); + } + + return RefCnt; +} + +LookupResult DeviceTy::lookupMapping(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr; + + DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%ld)...\n", DPxPTR(hp), + Size); + for (lr.Entry = HostDataToTargetMap.begin(); + lr.Entry != HostDataToTargetMap.end(); ++lr.Entry) { + auto &HT = *lr.Entry; + // Is it contained? + lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd && + (hp+Size) <= HT.HstPtrEnd; + // Does it extend into an already mapped region? + lr.Flags.ExtendsBefore = hp < HT.HstPtrBegin && (hp+Size) > HT.HstPtrBegin; + // Does it extend beyond the mapped region? + lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp+Size) > HT.HstPtrEnd; + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || + lr.Flags.ExtendsAfter) { + break; + } + } + + if (lr.Flags.ExtendsBefore) { + DP("WARNING: Pointer is not mapped but section extends into already " + "mapped data\n"); + } + if (lr.Flags.ExtendsAfter) { + DP("WARNING: Pointer is already mapped but section extends beyond mapped " + "region\n"); + } + + return lr; +} + +// Used by target_data_begin +// Return the target pointer begin (where the data will be moved). +// Allocate memory if this is the first occurrence of this mapping. +// Increment the reference counter. +// If NULL is returned, then either data allocation failed or the user tried +// to do an illegal mapping. +void *DeviceTy::getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, + int64_t Size, bool &IsNew, bool &IsHostPtr, bool IsImplicit, + bool UpdateRefCount, bool HasCloseModifier) { + void *rc = NULL; + IsHostPtr = false; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + // Check if the pointer is contained. + // If a variable is mapped to the device manually by the user - which would + // lead to the IsContained flag to be true - then we must ensure that the + // device address is returned even under unified memory conditions. + if (lr.Flags.IsContained || + ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && IsImplicit)) { + auto &HT = *lr.Entry; + IsNew = false; + + if (UpdateRefCount) + HT.incRefCount(); + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", (IsImplicit ? " (implicit)" : ""), + DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); + rc = (void *)tp; + } else if ((lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) && !IsImplicit) { + // Explicit extension of mapped data - not allowed. + DP("Explicit extension of mapping is not allowed.\n"); + } else if (Size) { + // If unified shared memory is active, implicitly mapped variables that are not + // privatized use host address. Any explicitly mapped variables also use + // host address where correctness is not impeded. In all other cases + // maps are respected. + // In addition to the mapping rules above, the close map + // modifier forces the mapping of the variable to the device. + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) { + DP("Return HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", + DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); + IsHostPtr = true; + rc = HstPtrBegin; + } else { + // If it is not contained and Size > 0 we should create a new entry for it. 
+ IsNew = true; + uintptr_t tp = (uintptr_t)RTL->data_alloc(RTLDeviceID, Size, HstPtrBegin); + DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD ", " + "HstEnd=" DPxMOD ", TgtBegin=" DPxMOD "\n", DPxPTR(HstPtrBase), + DPxPTR(HstPtrBegin), DPxPTR((uintptr_t)HstPtrBegin + Size), DPxPTR(tp)); + HostDataToTargetMap.push_front(HostDataToTargetTy((uintptr_t)HstPtrBase, + (uintptr_t)HstPtrBegin, (uintptr_t)HstPtrBegin + Size, tp)); + rc = (void *)tp; + } + } + + DataMapMtx.unlock(); + return rc; +} + +// Used by target_data_begin, target_data_end, target_data_update and target. +// Return the target pointer begin (where the data will be moved). +// Decrement the reference counter if called from target_data_end. +void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount, bool &IsHostPtr) { + void *rc = NULL; + IsHostPtr = false; + IsLast = false; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + IsLast = HT.getRefCount() == 1; + + if (!IsLast && UpdateRefCount) + HT.decRefCount(); + + uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin); + DP("Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", " + "Size=%ld,%s RefCount=%s\n", DPxPTR(HstPtrBegin), DPxPTR(tp), Size, + (UpdateRefCount ? " updated" : ""), + HT.isRefCountInf() ? "INF" : std::to_string(HT.getRefCount()).c_str()); + rc = (void *)tp; + } else if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) { + // If the value isn't found in the mapping and unified shared memory + // is on then it means we have stumbled upon a value which we need to + // use directly from the host. + DP("Get HstPtrBegin " DPxMOD " Size=%ld RefCount=%s\n", + DPxPTR((uintptr_t)HstPtrBegin), Size, (UpdateRefCount ? " updated" : "")); + IsHostPtr = true; + rc = HstPtrBegin; + } + + DataMapMtx.unlock(); + return rc; +} + +// Return the target pointer begin (where the data will be moved). +// Lock-free version called when loading global symbols from the fat binary. +void *DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size) { + uintptr_t hp = (uintptr_t)HstPtrBegin; + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin); + return (void *)tp; + } + + return NULL; +} + +int DeviceTy::deallocTgtPtr(void *HstPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier) { + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && !HasCloseModifier) + return OFFLOAD_SUCCESS; + // Check if the pointer is contained in any sub-nodes. + int rc; + DataMapMtx.lock(); + LookupResult lr = lookupMapping(HstPtrBegin, Size); + if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) { + auto &HT = *lr.Entry; + if (ForceDelete) + HT.resetRefCount(); + if (HT.decRefCount() == 0) { + DP("Deleting tgt data " DPxMOD " of size %ld\n", + DPxPTR(HT.TgtPtrBegin), Size); + RTL->data_delete(RTLDeviceID, (void *)HT.TgtPtrBegin); + DP("Removing%s mapping with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD + ", Size=%ld\n", (ForceDelete ? 
" (forced)" : ""), + DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size); + HostDataToTargetMap.erase(lr.Entry); + } + rc = OFFLOAD_SUCCESS; + } else { + DP("Section to delete (hst addr " DPxMOD ") does not exist in the allocated" + " memory\n", DPxPTR(HstPtrBegin)); + rc = OFFLOAD_FAIL; + } + + DataMapMtx.unlock(); + return rc; +} + +/// Init device, should not be called directly. +void DeviceTy::init() { + // Make call to init_requires if it exists for this plugin. + if (RTL->init_requires) + RTL->init_requires(RTLs->RequiresFlags); + int32_t rc = RTL->init_device(RTLDeviceID); + if (rc == OFFLOAD_SUCCESS) { + IsInit = true; + } +} + +/// Thread-safe method to initialize the device only once. +int32_t DeviceTy::initOnce() { + std::call_once(InitFlag, &DeviceTy::init, this); + + // At this point, if IsInit is true, then either this thread or some other + // thread in the past successfully initialized the device, so we can return + // OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it + // failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means + // that some other thread already attempted to execute init() and if IsInit + // is still false, return OFFLOAD_FAIL. + if (IsInit) + return OFFLOAD_SUCCESS; + else + return OFFLOAD_FAIL; +} + +// Load binary to device. +__tgt_target_table *DeviceTy::load_binary(void *Img) { + RTL->Mtx.lock(); + __tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img); + RTL->Mtx.unlock(); + return rc; +} + +// Submit data to device +int32_t DeviceTy::data_submit(void *TgtPtrBegin, void *HstPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_submit_async || !RTL->synchronize) + return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size); + else + return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size, + AsyncInfoPtr); +} + +// Retrieve data from device +int32_t DeviceTy::data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size); + else + return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size, + AsyncInfoPtr); +} + +// Transfer data between device from same vendor +int32_t DeviceTy::data_transfer(void *DstPtrBegin, void *SrcPtrBegin, + int64_t Size, __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->data_retrieve_async || !RTL->synchronize) + return RTL->data_transfer(RTLDeviceID, DstPtrBegin, SrcPtrBegin, Size); + else + return RTL->data_transfer_async(RTLDeviceID, DstPtrBegin, SrcPtrBegin, Size, + AsyncInfoPtr); +} + +// Run region on device +int32_t DeviceTy::run_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_region || !RTL->synchronize) + return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets, + TgtVarsSize); + else + return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, AsyncInfoPtr); +} + +// Run team region on device. 
+int32_t DeviceTy::run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr) { + if (!AsyncInfoPtr || !RTL->run_team_region_async || !RTL->synchronize) + return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit, + LoopTripCount); + else + return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, + TgtOffsets, TgtVarsSize, NumTeams, + ThreadLimit, LoopTripCount, AsyncInfoPtr); +} + +/// Check whether a device has an associated RTL and initialize it if it's not +/// already initialized. +bool device_is_ready(int device_num) { + DP("Checking whether device %d is ready.\n", device_num); + // Devices.size() can only change while registering a new + // library, so try to acquire the lock of RTLs' mutex. + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_num) { + DP("Device ID %d does not have a matching RTL\n", device_num); + return false; + } + + // Get device info + DeviceTy &Device = Devices[device_num]; + + DP("Is the device %d (local ID %d) initialized? %d\n", device_num, + Device.RTLDeviceID, Device.IsInit); + + // Init the device if not done before + if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) { + DP("Failed to init device %d\n", device_num); + return false; + } + + DP("Device %d is ready to use.\n", device_num); + + return true; +} diff --git a/openmp/libomptarget/src/device.h b/openmp/libomptarget/src/device.h index a3a5767f81ff5..2526c7d5268ce 100644 --- a/openmp/libomptarget/src/device.h +++ b/openmp/libomptarget/src/device.h @@ -1,204 +1,206 @@ -//===----------- device.h - Target independent OpenMP target RTL ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for managing devices that are handled by RTL plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_DEVICE_H -#define _OMPTARGET_DEVICE_H - -#include -#include -#include -#include -#include -#include - -// Forward declarations. -struct RTLInfoTy; -struct __tgt_bin_desc; -struct __tgt_target_table; -struct __tgt_async_info; - -/// Map between host data and target data. -struct HostDataToTargetTy { - uintptr_t HstPtrBase; // host info. - uintptr_t HstPtrBegin; - uintptr_t HstPtrEnd; // non-inclusive. - - uintptr_t TgtPtrBegin; // target info. - -private: - uint64_t RefCount; - static const uint64_t INFRefCount = ~(uint64_t)0; - -public: - HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, - bool IsINF = false) - : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), - TgtPtrBegin(TB), RefCount(IsINF ? 
INFRefCount : 1) {} - - uint64_t getRefCount() const { - return RefCount; - } - - uint64_t resetRefCount() { - if (RefCount != INFRefCount) - RefCount = 1; - - return RefCount; - } - - uint64_t incRefCount() { - if (RefCount != INFRefCount) { - ++RefCount; - assert(RefCount < INFRefCount && "refcount overflow"); - } - - return RefCount; - } - - uint64_t decRefCount() { - if (RefCount != INFRefCount) { - assert(RefCount > 0 && "refcount underflow"); - --RefCount; - } - - return RefCount; - } - - bool isRefCountInf() const { - return RefCount == INFRefCount; - } -}; - -typedef std::list<HostDataToTargetTy> HostDataToTargetListTy; - -struct LookupResult { - struct { - unsigned IsContained : 1; - unsigned ExtendsBefore : 1; - unsigned ExtendsAfter : 1; - } Flags; - - HostDataToTargetListTy::iterator Entry; - - LookupResult() : Flags({0,0,0}), Entry() {} -}; - -/// Map for shadow pointers -struct ShadowPtrValTy { - void *HstPtrVal; - void *TgtPtrAddr; - void *TgtPtrVal; -}; -typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy; - -/// -struct PendingCtorDtorListsTy { - std::list<void *> PendingCtors; - std::list<void *> PendingDtors; -}; -typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> - PendingCtorsDtorsPerLibrary; - -struct DeviceTy { - int32_t DeviceID; - RTLInfoTy *RTL; - int32_t RTLDeviceID; - - bool IsInit; - std::once_flag InitFlag; - bool HasPendingGlobals; - - HostDataToTargetListTy HostDataToTargetMap; - PendingCtorsDtorsPerLibrary PendingCtorsDtors; - - ShadowPtrListTy ShadowPtrMap; - - std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; - - // NOTE: Once libomp gains full target-task support, this state should be - // moved into the target task in libomp. - std::map LoopTripCnt; - - DeviceTy(RTLInfoTy *RTL) - : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), - HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), - ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} - - // The existence of mutexes makes DeviceTy non-copyable. We need to - // provide a copy constructor and an assignment operator explicitly. 
- DeviceTy(const DeviceTy &d) - : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), - IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), - HostDataToTargetMap(d.HostDataToTargetMap), - PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), - DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), - LoopTripCnt(d.LoopTripCnt) {} - - DeviceTy& operator=(const DeviceTy &d) { - DeviceID = d.DeviceID; - RTL = d.RTL; - RTLDeviceID = d.RTLDeviceID; - IsInit = d.IsInit; - HasPendingGlobals = d.HasPendingGlobals; - HostDataToTargetMap = d.HostDataToTargetMap; - PendingCtorsDtors = d.PendingCtorsDtors; - ShadowPtrMap = d.ShadowPtrMap; - LoopTripCnt = d.LoopTripCnt; - - return *this; - } - - uint64_t getMapEntryRefCnt(void *HstPtrBegin); - LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); - void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, - bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, - bool HasCloseModifier = false); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); - void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, - bool UpdateRefCount, bool &IsHostPtr); - int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, - bool HasCloseModifier = false); - int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); - int disassociatePtr(void *HstPtrBegin); - - // calls to RTL - int32_t initOnce(); - __tgt_target_table *load_binary(void *Img); - - // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be - // synchronous. - int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, - __tgt_async_info *AsyncInfoPtr); - - int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - __tgt_async_info *AsyncInfoPtr); - int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, - ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, - int32_t NumTeams, int32_t ThreadLimit, - uint64_t LoopTripCount, - __tgt_async_info *AsyncInfoPtr); - -private: - // Call to RTL - void init(); // To be called only via DeviceTy::initOnce() -}; - -/// Map between Device ID (i.e. openmp device id) and its DeviceTy. -typedef std::vector DevicesTy; -extern DevicesTy Devices; - -extern bool device_is_ready(int device_num); - -#endif +//===----------- device.h - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for managing devices that are handled by RTL plugins. +// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_DEVICE_H +#define _OMPTARGET_DEVICE_H + +#include +#include +#include +#include +#include +#include + +// Forward declarations. +struct RTLInfoTy; +struct __tgt_bin_desc; +struct __tgt_target_table; +struct __tgt_async_info; + +/// Map between host data and target data. +struct HostDataToTargetTy { + uintptr_t HstPtrBase; // host info. + uintptr_t HstPtrBegin; + uintptr_t HstPtrEnd; // non-inclusive. + + uintptr_t TgtPtrBegin; // target info. 
+ +private: + uint64_t RefCount; + static const uint64_t INFRefCount = ~(uint64_t)0; + +public: + HostDataToTargetTy(uintptr_t BP, uintptr_t B, uintptr_t E, uintptr_t TB, + bool IsINF = false) + : HstPtrBase(BP), HstPtrBegin(B), HstPtrEnd(E), + TgtPtrBegin(TB), RefCount(IsINF ? INFRefCount : 1) {} + + uint64_t getRefCount() const { + return RefCount; + } + + uint64_t resetRefCount() { + if (RefCount != INFRefCount) + RefCount = 1; + + return RefCount; + } + + uint64_t incRefCount() { + if (RefCount != INFRefCount) { + ++RefCount; + assert(RefCount < INFRefCount && "refcount overflow"); + } + + return RefCount; + } + + uint64_t decRefCount() { + if (RefCount != INFRefCount) { + assert(RefCount > 0 && "refcount underflow"); + --RefCount; + } + + return RefCount; + } + + bool isRefCountInf() const { + return RefCount == INFRefCount; + } +}; + +typedef std::list<HostDataToTargetTy> HostDataToTargetListTy; + +struct LookupResult { + struct { + unsigned IsContained : 1; + unsigned ExtendsBefore : 1; + unsigned ExtendsAfter : 1; + } Flags; + + HostDataToTargetListTy::iterator Entry; + + LookupResult() : Flags({0,0,0}), Entry() {} +}; + +/// Map for shadow pointers +struct ShadowPtrValTy { + void *HstPtrVal; + void *TgtPtrAddr; + void *TgtPtrVal; +}; +typedef std::map<void *, ShadowPtrValTy> ShadowPtrListTy; + +/// +struct PendingCtorDtorListsTy { + std::list<void *> PendingCtors; + std::list<void *> PendingDtors; +}; +typedef std::map<__tgt_bin_desc *, PendingCtorDtorListsTy> + PendingCtorsDtorsPerLibrary; + +struct DeviceTy { + int32_t DeviceID; + RTLInfoTy *RTL; + int32_t RTLDeviceID; + + bool IsInit; + std::once_flag InitFlag; + bool HasPendingGlobals; + + HostDataToTargetListTy HostDataToTargetMap; + PendingCtorsDtorsPerLibrary PendingCtorsDtors; + + ShadowPtrListTy ShadowPtrMap; + + std::mutex DataMapMtx, PendingGlobalsMtx, ShadowMtx; + + // NOTE: Once libomp gains full target-task support, this state should be + // moved into the target task in libomp. + std::map LoopTripCnt; + + DeviceTy(RTLInfoTy *RTL) + : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(), + HasPendingGlobals(false), HostDataToTargetMap(), PendingCtorsDtors(), + ShadowPtrMap(), DataMapMtx(), PendingGlobalsMtx(), ShadowMtx() {} + + // The existence of mutexes makes DeviceTy non-copyable. We need to + // provide a copy constructor and an assignment operator explicitly. 
+ DeviceTy(const DeviceTy &d) + : DeviceID(d.DeviceID), RTL(d.RTL), RTLDeviceID(d.RTLDeviceID), + IsInit(d.IsInit), InitFlag(), HasPendingGlobals(d.HasPendingGlobals), + HostDataToTargetMap(d.HostDataToTargetMap), + PendingCtorsDtors(d.PendingCtorsDtors), ShadowPtrMap(d.ShadowPtrMap), + DataMapMtx(), PendingGlobalsMtx(), ShadowMtx(), + LoopTripCnt(d.LoopTripCnt) {} + + DeviceTy& operator=(const DeviceTy &d) { + DeviceID = d.DeviceID; + RTL = d.RTL; + RTLDeviceID = d.RTLDeviceID; + IsInit = d.IsInit; + HasPendingGlobals = d.HasPendingGlobals; + HostDataToTargetMap = d.HostDataToTargetMap; + PendingCtorsDtors = d.PendingCtorsDtors; + ShadowPtrMap = d.ShadowPtrMap; + LoopTripCnt = d.LoopTripCnt; + + return *this; + } + + uint64_t getMapEntryRefCnt(void *HstPtrBegin); + LookupResult lookupMapping(void *HstPtrBegin, int64_t Size); + void *getOrAllocTgtPtr(void *HstPtrBegin, void *HstPtrBase, int64_t Size, + bool &IsNew, bool &IsHostPtr, bool IsImplicit, bool UpdateRefCount = true, + bool HasCloseModifier = false); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size); + void *getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast, + bool UpdateRefCount, bool &IsHostPtr); + int deallocTgtPtr(void *TgtPtrBegin, int64_t Size, bool ForceDelete, + bool HasCloseModifier = false); + int associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size); + int disassociatePtr(void *HstPtrBegin); + + // calls to RTL + int32_t initOnce(); + __tgt_target_table *load_binary(void *Img); + + // Data transfer. When AsyncInfoPtr is nullptr, the transfer will be + // synchronous. + int32_t data_submit(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_retrieve(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + int32_t data_transfer(void *DstPtrBegin, void *SrcPtrBegin, int64_t Size, + __tgt_async_info *AsyncInfoPtr); + + int32_t run_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + __tgt_async_info *AsyncInfoPtr); + int32_t run_team_region(void *TgtEntryPtr, void **TgtVarsPtr, + ptrdiff_t *TgtOffsets, int32_t TgtVarsSize, + int32_t NumTeams, int32_t ThreadLimit, + uint64_t LoopTripCount, + __tgt_async_info *AsyncInfoPtr); + +private: + // Call to RTL + void init(); // To be called only via DeviceTy::initOnce() +}; + +/// Map between Device ID (i.e. openmp device id) and its DeviceTy. 
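For illustration only (not part of this patch): a minimal sketch, under assumed names, of how the DeviceTy wrappers declared above compose into one synchronous offload, using the Devices table and device_is_ready() declared just below. The real driver (target() in omptarget.cpp, later in this patch) additionally resolves the entry point through the translation tables, batches arguments, and handles shadow pointers and async info; the hypothetical helper below skips all of that.

int offloadOneArraySketch(int device_id, void *TgtEntryPtr, void *HstBuf,
                          int64_t Size) {
  if (!device_is_ready(device_id))
    return OFFLOAD_FAIL;
  DeviceTy &Device = Devices[device_id];

  // Reserve (or look up) device memory for the host buffer.
  bool IsNew, IsHostPtr;
  void *TgtBuf = Device.getOrAllocTgtPtr(HstBuf, HstBuf, Size, IsNew,
                                         IsHostPtr, /*IsImplicit=*/false);
  if (!TgtBuf)
    return OFFLOAD_FAIL;

  // Copy in, launch synchronously (null async info), copy out.
  ptrdiff_t Offset = 0;
  if (Device.data_submit(TgtBuf, HstBuf, Size, nullptr) != OFFLOAD_SUCCESS ||
      Device.run_region(TgtEntryPtr, &TgtBuf, &Offset, /*TgtVarsSize=*/1,
                        nullptr) != OFFLOAD_SUCCESS ||
      Device.data_retrieve(HstBuf, TgtBuf, Size, nullptr) != OFFLOAD_SUCCESS)
    return OFFLOAD_FAIL;

  // Drop the reference taken by getOrAllocTgtPtr.
  return Device.deallocTgtPtr(HstBuf, Size, /*ForceDelete=*/false);
}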
+typedef std::vector DevicesTy; +extern DevicesTy Devices; + +extern bool device_is_ready(int device_num); + +#endif diff --git a/openmp/libomptarget/src/exports b/openmp/libomptarget/src/exports index e1fee4bbefcec..e8f35e531db2a 100644 --- a/openmp/libomptarget/src/exports +++ b/openmp/libomptarget/src/exports @@ -1,31 +1,31 @@ -VERS1.0 { - global: - __tgt_register_requires; - __tgt_register_lib; - __tgt_unregister_lib; - __tgt_target_data_begin; - __tgt_target_data_end; - __tgt_target_data_update; - __tgt_target; - __tgt_target_teams; - __tgt_target_data_begin_nowait; - __tgt_target_data_end_nowait; - __tgt_target_data_update_nowait; - __tgt_target_nowait; - __tgt_target_teams_nowait; - __tgt_mapper_num_components; - __tgt_push_mapper_component; - omp_get_num_devices; - omp_get_initial_device; - omp_target_alloc; - omp_target_free; - omp_target_is_present; - omp_target_memcpy; - omp_target_memcpy_rect; - omp_target_associate_ptr; - omp_target_disassociate_ptr; - __kmpc_push_target_tripcount; - local: - *; -}; - +VERS1.0 { + global: + __tgt_register_requires; + __tgt_register_lib; + __tgt_unregister_lib; + __tgt_target_data_begin; + __tgt_target_data_end; + __tgt_target_data_update; + __tgt_target; + __tgt_target_teams; + __tgt_target_data_begin_nowait; + __tgt_target_data_end_nowait; + __tgt_target_data_update_nowait; + __tgt_target_nowait; + __tgt_target_teams_nowait; + __tgt_mapper_num_components; + __tgt_push_mapper_component; + omp_get_num_devices; + omp_get_initial_device; + omp_target_alloc; + omp_target_free; + omp_target_is_present; + omp_target_memcpy; + omp_target_memcpy_rect; + omp_target_associate_ptr; + omp_target_disassociate_ptr; + __kmpc_push_target_tripcount; + local: + *; +}; + diff --git a/openmp/libomptarget/src/interface.cpp b/openmp/libomptarget/src/interface.cpp index 924bc490b1107..5e2aff6c82dbc 100644 --- a/openmp/libomptarget/src/interface.cpp +++ b/openmp/libomptarget/src/interface.cpp @@ -1,350 +1,350 @@ -//===-------- interface.cpp - Target independent OpenMP target RTL --------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of the interface to be used by Clang during the codegen of a -// target region. 
-// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include - -// Store target policy (disabled, mandatory, default) -kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; -std::mutex TargetOffloadMtx; - -//////////////////////////////////////////////////////////////////////////////// -/// manage the success or failure of a target construct - -static void HandleDefaultTargetOffload() { - TargetOffloadMtx.lock(); - if (TargetOffloadPolicy == tgt_default) { - if (omp_get_num_devices() > 0) { - DP("Default TARGET OFFLOAD policy is now mandatory " - "(devices were found)\n"); - TargetOffloadPolicy = tgt_mandatory; - } else { - DP("Default TARGET OFFLOAD policy is now disabled " - "(no devices were found)\n"); - TargetOffloadPolicy = tgt_disabled; - } - } - TargetOffloadMtx.unlock(); -} - -static int IsOffloadDisabled() { - if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); - return TargetOffloadPolicy == tgt_disabled; -} - -static void HandleTargetOutcome(bool success) { - switch (TargetOffloadPolicy) { - case tgt_disabled: - if (success) { - FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); - } - break; - case tgt_default: - FATAL_MESSAGE0(1, "default offloading policy must be switched to " - "mandatory or disabled"); - break; - case tgt_mandatory: - if (!success) { - FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); - } - break; - } -} - -//////////////////////////////////////////////////////////////////////////////// -/// adds requires flags -EXTERN void __tgt_register_requires(int64_t flags) { - RTLs->RegisterRequires(flags); -} - -//////////////////////////////////////////////////////////////////////////////// -/// adds a target shared library to the target execution image -EXTERN void __tgt_register_lib(__tgt_bin_desc *desc) { - RTLs->RegisterLib(desc); -} - -//////////////////////////////////////////////////////////////////////////////// -/// unloads a target shared library -EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { - RTLs->UnregisterLib(desc); -} - -/// creates host-to-target data mapping, stores it in the -/// libomptarget.so internal structure (an entry in a stack of data maps) -/// and passes the data to the device. -EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - - DP("Entering data begin region for device %" PRId64 " with %d mappings\n", - device_id, arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - DP("Use default device id %" PRId64 "\n", device_id); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - -#ifdef OMPTARGET_DEBUG - for (int i = 0; i < arg_num; ++i) { - DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 "\n", - i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); - } -#endif - - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, nullptr); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); -} - -EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t depNum, void *depList, int32_t noAliasDepNum, - void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -/// passes data from the target, releases target memory and destroys -/// the host-target mapping (top entry from the stack of data maps) -/// created by the last __tgt_target_data_begin. -EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - DP("Entering data end region with %d mappings\n", arg_num); - - // No devices available? - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - RTLsMtx->lock(); - size_t Devices_size = Devices.size(); - RTLsMtx->unlock(); - if (Devices_size <= (size_t)device_id) { - DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy &Device = Devices[device_id]; - if (!Device.IsInit) { - DP("Uninit device: ignore"); - HandleTargetOutcome(false); - return; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return; - DP("Entering data update with %d mappings\n", arg_num); - - // No devices available? 
- if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DeviceTy& Device = Devices[device_id]; - int rc = target_data_update(Device, arg_num, args_base, - args, arg_sizes, arg_types); - HandleTargetOutcome(rc == OFFLOAD_SUCCESS); -} - -EXTERN void __tgt_target_data_update_nowait( - int64_t device_id, int32_t arg_num, void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, - int32_t noAliasDepNum, void *noAliasDepList) { - if (depNum + noAliasDepNum > 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, - arg_types); -} - -EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, - int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, int32_t team_num, int32_t thread_limit) { - if (IsOffloadDisabled()) return OFFLOAD_FAIL; - DP("Entering target region with entry point " DPxMOD " and device Id %" - PRId64 "\n", DPxPTR(host_ptr), device_id); - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return OFFLOAD_FAIL; - } - -#ifdef OMPTARGET_DEBUG - for (int i=0; i 0) - __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); - - return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, - arg_sizes, arg_types, team_num, thread_limit); -} - -// Get the current number of components for a user-defined mapper. -EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - int64_t size = MapperComponentsPtr->Components.size(); - DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", - DPxPTR(rt_mapper_handle), size); - return size; -} - -// Push back one component for a user-defined mapper. 
-EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, - int64_t type) { - DP("__tgt_push_mapper_component(Handle=" DPxMOD - ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 - ", Type=0x%" PRIx64 ").\n", - DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); - auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; - MapperComponentsPtr->Components.push_back( - MapComponentInfoTy(base, begin, size, type)); -} - -EXTERN void __kmpc_push_target_tripcount(int64_t device_id, - uint64_t loop_tripcount) { - if (IsOffloadDisabled()) - return; - - if (device_id == OFFLOAD_DEVICE_DEFAULT) { - device_id = omp_get_default_device(); - } - - if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { - DP("Failed to get device %" PRId64 " ready\n", device_id); - HandleTargetOutcome(false); - return; - } - - DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, - loop_tripcount); - TblMapMtx->lock(); - Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), - loop_tripcount); - TblMapMtx->unlock(); -} +//===-------- interface.cpp - Target independent OpenMP target RTL --------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include + +// Store target policy (disabled, mandatory, default) +kmp_target_offload_kind_t TargetOffloadPolicy = tgt_default; +std::mutex TargetOffloadMtx; + +//////////////////////////////////////////////////////////////////////////////// +/// manage the success or failure of a target construct + +static void HandleDefaultTargetOffload() { + TargetOffloadMtx.lock(); + if (TargetOffloadPolicy == tgt_default) { + if (omp_get_num_devices() > 0) { + DP("Default TARGET OFFLOAD policy is now mandatory " + "(devices were found)\n"); + TargetOffloadPolicy = tgt_mandatory; + } else { + DP("Default TARGET OFFLOAD policy is now disabled " + "(no devices were found)\n"); + TargetOffloadPolicy = tgt_disabled; + } + } + TargetOffloadMtx.unlock(); +} + +static int IsOffloadDisabled() { + if (TargetOffloadPolicy == tgt_default) HandleDefaultTargetOffload(); + return TargetOffloadPolicy == tgt_disabled; +} + +static void HandleTargetOutcome(bool success) { + switch (TargetOffloadPolicy) { + case tgt_disabled: + if (success) { + FATAL_MESSAGE0(1, "expected no offloading while offloading is disabled"); + } + break; + case tgt_default: + FATAL_MESSAGE0(1, "default offloading policy must be switched to " + "mandatory or disabled"); + break; + case tgt_mandatory: + if (!success) { + FATAL_MESSAGE0(1, "failure of target construct while offloading is mandatory"); + } + break; + } +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds requires flags +EXTERN void __tgt_register_requires(int64_t flags) { + RTLs->RegisterRequires(flags); +} + +//////////////////////////////////////////////////////////////////////////////// +/// adds a target shared library to the target execution image +EXTERN void 
__tgt_register_lib(__tgt_bin_desc *desc) { + RTLs->RegisterLib(desc); +} + +//////////////////////////////////////////////////////////////////////////////// +/// unloads a target shared library +EXTERN void __tgt_unregister_lib(__tgt_bin_desc *desc) { + RTLs->UnregisterLib(desc); +} + +/// creates host-to-target data mapping, stores it in the +/// libomptarget.so internal structure (an entry in a stack of data maps) +/// and passes the data to the device. +EXTERN void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + + DP("Entering data begin region for device %" PRId64 " with %d mappings\n", + device_id, arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + DP("Use default device id %" PRId64 "\n", device_id); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + +#ifdef OMPTARGET_DEBUG + for (int i = 0; i < arg_num; ++i) { + DP("Entry %2d: Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 "\n", + i, DPxPTR(args_base[i]), DPxPTR(args[i]), arg_sizes[i], arg_types[i]); + } +#endif + + int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, + arg_types, nullptr); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_begin_nowait(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t depNum, void *depList, int32_t noAliasDepNum, + void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +/// passes data from the target, releases target memory and destroys +/// the host-target mapping (top entry from the stack of data maps) +/// created by the last __tgt_target_data_begin. +EXTERN void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data end region with %d mappings\n", arg_num); + + // No devices available? + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + RTLsMtx->lock(); + size_t Devices_size = Devices.size(); + RTLsMtx->unlock(); + if (Devices_size <= (size_t)device_id) { + DP("Device ID %" PRId64 " does not have a matching RTL.\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy &Device = Devices[device_id]; + if (!Device.IsInit) { + DP("Uninit device: ignore"); + HandleTargetOutcome(false); + return; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN void __tgt_target_data_update(int64_t device_id, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return; + DP("Entering data update with %d mappings\n", arg_num); + + // No devices available? 
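For illustration only (not part of this patch): roughly the call sequence compiler-generated host code makes against the entry points above for a simple #pragma omp target data map(tofrom: a[0:N]). The wrapper name, the flag combination and the use of the default device are illustrative assumptions; the real encoding is emitted by Clang.

void host_side_sketch(double *a, int64_t N) {
  void *args_base[] = {a};
  void *args[] = {a};
  int64_t arg_sizes[] = {N * (int64_t)sizeof(double)};
  // "tofrom": copy to the device on entry, back to the host on exit.
  int64_t arg_types[] = {OMP_TGT_MAPTYPE_TO | OMP_TGT_MAPTYPE_FROM};

  __tgt_target_data_begin(/*device_id=*/OFFLOAD_DEVICE_DEFAULT, /*arg_num=*/1,
                          args_base, args, arg_sizes, arg_types);
  // ... host code executed while the data region is active ...
  __tgt_target_data_end(OFFLOAD_DEVICE_DEFAULT, /*arg_num=*/1, args_base, args,
                        arg_sizes, arg_types);
}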
+ if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DeviceTy& Device = Devices[device_id]; + int rc = target_data_update(Device, arg_num, args_base, + args, arg_sizes, arg_types); + HandleTargetOutcome(rc == OFFLOAD_SUCCESS); +} + +EXTERN void __tgt_target_data_update_nowait( + int64_t device_id, int32_t arg_num, void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types, int32_t depNum, void *depList, + int32_t noAliasDepNum, void *noAliasDepList) { + if (depNum + noAliasDepNum > 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + __tgt_target_data_update(device_id, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target(device_id, host_ptr, arg_num, args_base, args, arg_sizes, + arg_types); +} + +EXTERN int __tgt_target_teams(int64_t device_id, void *host_ptr, + int32_t arg_num, void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, int32_t team_num, int32_t thread_limit) { + if (IsOffloadDisabled()) return OFFLOAD_FAIL; + DP("Entering target region with entry point " DPxMOD " and device Id %" + PRId64 "\n", DPxPTR(host_ptr), device_id); + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return OFFLOAD_FAIL; + } + +#ifdef OMPTARGET_DEBUG + for (int i=0; i 0) + __kmpc_omp_taskwait(NULL, __kmpc_global_thread_num(NULL)); + + return __tgt_target_teams(device_id, host_ptr, arg_num, args_base, args, + arg_sizes, arg_types, team_num, thread_limit); +} + +// Get the current number of components for a user-defined mapper. +EXTERN int64_t __tgt_mapper_num_components(void *rt_mapper_handle) { + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + int64_t size = MapperComponentsPtr->Components.size(); + DP("__tgt_mapper_num_components(Handle=" DPxMOD ") returns %" PRId64 "\n", + DPxPTR(rt_mapper_handle), size); + return size; +} + +// Push back one component for a user-defined mapper. 
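For illustration only (not part of this patch): a sketch of how a compiler-generated mapper function might drive __tgt_mapper_num_components above and the __tgt_push_mapper_component entry point that follows. The mapper name and its exact contract with Clang are assumptions, not taken from this patch.

void omp_mapper_sketch(void *rt_mapper_handle, void *base, void *begin,
                       int64_t size, int64_t type) {
  // Components already collected for this handle (useful when mappers nest
  // and later entries need to know their position in the list).
  int64_t previous = __tgt_mapper_num_components(rt_mapper_handle);
  (void)previous;
  // Record one mapping component for the element being mapped.
  __tgt_push_mapper_component(rt_mapper_handle, base, begin, size, type);
}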
+EXTERN void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, + void *begin, int64_t size, + int64_t type) { + DP("__tgt_push_mapper_component(Handle=" DPxMOD + ") adds an entry (Base=" DPxMOD ", Begin=" DPxMOD ", Size=%" PRId64 + ", Type=0x%" PRIx64 ").\n", + DPxPTR(rt_mapper_handle), DPxPTR(base), DPxPTR(begin), size, type); + auto *MapperComponentsPtr = (struct MapperComponentsTy *)rt_mapper_handle; + MapperComponentsPtr->Components.push_back( + MapComponentInfoTy(base, begin, size, type)); +} + +EXTERN void __kmpc_push_target_tripcount(int64_t device_id, + uint64_t loop_tripcount) { + if (IsOffloadDisabled()) + return; + + if (device_id == OFFLOAD_DEVICE_DEFAULT) { + device_id = omp_get_default_device(); + } + + if (CheckDeviceAndCtors(device_id) != OFFLOAD_SUCCESS) { + DP("Failed to get device %" PRId64 " ready\n", device_id); + HandleTargetOutcome(false); + return; + } + + DP("__kmpc_push_target_tripcount(%" PRId64 ", %" PRIu64 ")\n", device_id, + loop_tripcount); + TblMapMtx->lock(); + Devices[device_id].LoopTripCnt.emplace(__kmpc_global_thread_num(NULL), + loop_tripcount); + TblMapMtx->unlock(); +} diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp index 3113bdc2a9d39..ea6ca336d1c0b 100644 --- a/openmp/libomptarget/src/omptarget.cpp +++ b/openmp/libomptarget/src/omptarget.cpp @@ -1,823 +1,823 @@ -//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Implementation of the interface to be used by Clang during the codegen of a -// target region. -// -//===----------------------------------------------------------------------===// - -#include - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include - -#ifdef OMPTARGET_DEBUG -int DebugLevel = 0; -#endif // OMPTARGET_DEBUG - - - -/* All begin addresses for partially mapped structs must be 8-aligned in order - * to ensure proper alignment of members. E.g. - * - * struct S { - * int a; // 4-aligned - * int b; // 4-aligned - * int *p; // 8-aligned - * } s1; - * ... - * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) - * { - * s1.b = 5; - * for (int i...) s1.p[i] = ...; - * } - * - * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and - * BeginAddress=&s1.b. Let's assume that the struct begins at address 0x100, - * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment - * requirements for its type. Now, when we allocate memory on the device, in - * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. - * This means that the chunk of the struct on the device will start at a - * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and - * address of p will be a misaligned 0x204 (on the host there was no need to add - * padding between b and p, so p comes exactly 4 bytes after b). If the device - * kernel tries to access s1.p, a misaligned address error occurs (as reported - * by the CUDA plugin). 
By padding the begin address down to a multiple of 8 and - * extending the size of the allocated chuck accordingly, the chuck on the - * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and - * &s1.p=0x208, as they should be to satisfy the alignment requirements. - */ -static const int64_t alignment = 8; - -/// Map global data and execute pending ctors -static int InitLibrary(DeviceTy& Device) { - /* - * Map global data - */ - int32_t device_id = Device.DeviceID; - int rc = OFFLOAD_SUCCESS; - - Device.PendingGlobalsMtx.lock(); - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(); - ii != HostEntriesBeginToTransTable->end(); ++ii) { - TranslationTable *TransTable = &ii->second; - if (TransTable->HostTable.EntriesBegin == - TransTable->HostTable.EntriesEnd) { - // No host entry so no need to proceed - continue; - } - if (TransTable->TargetsTable[device_id] != 0) { - // Library entries have already been processed - continue; - } - - // 1) get image. - assert(TransTable->TargetsImages.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_device_image *img = TransTable->TargetsImages[device_id]; - if (!img) { - DP("No image loaded for device id %d.\n", device_id); - rc = OFFLOAD_FAIL; - break; - } - // 2) load image into the target table. - __tgt_target_table *TargetTable = - TransTable->TargetsTable[device_id] = Device.load_binary(img); - // Unable to get table for this image: invalidate image and fail. - if (!TargetTable) { - DP("Unable to generate entries table for device id %d.\n", device_id); - TransTable->TargetsImages[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // Verify whether the two table sizes match. - size_t hsize = - TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; - size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; - - // Invalid image for these host entries! - if (hsize != tsize) { - DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", - device_id, hsize, tsize); - TransTable->TargetsImages[device_id] = 0; - TransTable->TargetsTable[device_id] = 0; - rc = OFFLOAD_FAIL; - break; - } - - // process global data that needs to be mapped. - Device.DataMapMtx.lock(); - __tgt_target_table *HostTable = &TransTable->HostTable; - for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, - *CurrHostEntry = HostTable->EntriesBegin, - *EntryDeviceEnd = TargetTable->EntriesEnd; - CurrDeviceEntry != EntryDeviceEnd; - CurrDeviceEntry++, CurrHostEntry++) { - if (CurrDeviceEntry->size != 0) { - // has data. - assert(CurrDeviceEntry->size == CurrHostEntry->size && - "data size mismatch"); - - // Fortran may use multiple weak declarations for the same symbol, - // therefore we must allow for multiple weak symbols to be loaded from - // the fat binary. Treat these mappings as any other "regular" mapping. - // Add entry to map. 
- if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) - continue; - DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" - "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), - CurrDeviceEntry->size); - Device.HostDataToTargetMap.push_front(HostDataToTargetTy( - (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, - (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, - (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, - (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, - true /*IsRefCountINF*/)); - } - } - Device.DataMapMtx.unlock(); - } - TrlTblMtx->unlock(); - - if (rc != OFFLOAD_SUCCESS) { - Device.PendingGlobalsMtx.unlock(); - return rc; - } - - /* - * Run ctors for static objects - */ - if (!Device.PendingCtorsDtors.empty()) { - // Call all ctors for all libraries registered so far - for (auto &lib : Device.PendingCtorsDtors) { - if (!lib.second.PendingCtors.empty()) { - DP("Has pending ctors... call now\n"); - for (auto &entry : lib.second.PendingCtors) { - void *ctor = entry; - int rc = target(device_id, ctor, 0, NULL, NULL, NULL, - NULL, 1, 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); - Device.PendingGlobalsMtx.unlock(); - return OFFLOAD_FAIL; - } - } - // Clear the list to indicate that this device has been used - lib.second.PendingCtors.clear(); - DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); - } - } - } - Device.HasPendingGlobals = false; - Device.PendingGlobalsMtx.unlock(); - - return OFFLOAD_SUCCESS; -} - -// Check whether a device has been initialized, global ctors have been -// executed and global data has been mapped; do so if not already done. -int CheckDeviceAndCtors(int64_t device_id) { - // Is device ready? - if (!device_is_ready(device_id)) { - DP("Device %" PRId64 " is not ready.\n", device_id); - return OFFLOAD_FAIL; - } - - // Get device info. - DeviceTy &Device = Devices[device_id]; - - // Check whether global data has been mapped for this device - Device.PendingGlobalsMtx.lock(); - bool hasPendingGlobals = Device.HasPendingGlobals; - Device.PendingGlobalsMtx.unlock(); - if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { - DP("Failed to init globals on device %" PRId64 "\n", device_id); - return OFFLOAD_FAIL; - } - - return OFFLOAD_SUCCESS; -} - -static int32_t member_of(int64_t type) { - return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; -} - -/// Internal function to do the mapping and transfer the data to the device -int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { - // process each input. - for (int32_t i = 0; i < arg_num; ++i) { - // Ignore private variables and arrays - there is no mapping for them. - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; - int64_t data_size = arg_sizes[i]; - - // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. 
- int64_t padding = 0; - const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; - data_size += padding; - } - } - - // Address of pointer on the host and device, respectively. - void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; - bool IsNew, Pointer_IsNew; - bool IsHostPtr = false; - bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; - // Force the creation of a device side copy of the data when: - // a close map modifier was associated with a map that contained a to. - bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; - // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we - // have reached this point via __tgt_target_data_begin and not __tgt_target - // then no argument is marked as TARGET_PARAM ("omp target data map" is not - // associated with a target region, so there are no target parameters). This - // may be considered a hack, we could revise the scheme in the future. - bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - DP("Has a pointer entry: \n"); - // base is address of pointer. - Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, - sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, - HasCloseModifier); - if (!Pointer_TgtPtrBegin) { - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); - return OFFLOAD_FAIL; - } - DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" - "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), - (Pointer_IsNew ? "" : " not")); - Pointer_HstPtrBegin = HstPtrBase; - // modify current entry. - HstPtrBase = *(void **)HstPtrBase; - UpdateRef = true; // subsequently update ref count of pointee - } - - void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, - data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); - if (!TgtPtrBegin && data_size) { - // If data_size==0, then the argument could be a zero-length pointer to - // NULL, so getOrAlloc() returning NULL is not an error. - DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " - "illegal mapping).\n"); - } - DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), - (IsNew ? "" : " not")); - - if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { - uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; - void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); - DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); - args_base[i] = TgtPtrBase; - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - bool copy = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || - HasCloseModifier) { - if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { - copy = true; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { - // Copy data only if the "parent" struct has RefCount==1. 
- int32_t parent_idx = member_of(arg_types[i]); - uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { - copy = true; - } - } - } - - if (copy && !IsHostPtr) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, - async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { - DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); - int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, - sizeof(void *), async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - // create shadow pointers for this entry - Device.ShadowMtx.lock(); - Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, - Pointer_TgtPtrBegin, TgtPtrBase}; - Device.ShadowMtx.unlock(); - } - } - - return OFFLOAD_SUCCESS; -} - -/// Internal function to undo the mapping and retrieve the data from the device. -int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr) { - // process each input. - for (int32_t i = arg_num - 1; i >= 0; --i) { - // Ignore private variables and arrays - there is no mapping for them. - // Also, ignore the use_device_ptr directive, it has no effect here. - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - int64_t data_size = arg_sizes[i]; - // Adjust for proper alignment if this is a combined entry (for structs). - // Look at the next argument - if that is MEMBER_OF this one, then this one - // is a combined entry. - int64_t padding = 0; - const int next_i = i+1; - if (member_of(arg_types[i]) < 0 && next_i < arg_num && - member_of(arg_types[next_i]) == i) { - padding = (int64_t)HstPtrBegin % alignment; - if (padding) { - DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD - "\n", padding, DPxPTR(HstPtrBegin)); - HstPtrBegin = (char *) HstPtrBegin - padding; - data_size += padding; - } - } - - bool IsLast, IsHostPtr; - bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || - (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); - bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; - bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; - - // If PTR_AND_OBJ, HstPtrBegin is address of pointee - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, - UpdateRef, IsHostPtr); - DP("There are %" PRId64 " bytes allocated at target address " DPxMOD - " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), - (IsLast ? 
"" : " not")); - - bool DelEntry = IsLast || ForceDelete; - - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - DelEntry = false; // protect parent struct from being deallocated - } - - if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { - // Move data back to the host - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; - bool CopyMember = false; - if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || - HasCloseModifier) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { - // Copy data only if the "parent" struct has RefCount==1. - int32_t parent_idx = member_of(arg_types[i]); - uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); - assert(parent_rc > 0 && "parent struct not found"); - if (parent_rc == 1) { - CopyMember = true; - } - } - } - - if ((DelEntry || Always || CopyMember) && - !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin)) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, - async_info_ptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - - // If we copied back to the host a struct/array containing pointers, we - // need to restore the original host pointer values from their shadow - // copies. If the struct is going to be deallocated, remove any remaining - // shadow pointer entries for this struct. - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end();) { - void **ShadowHstPtrAddr = (void**) it->first; - - // An STL map is sorted on its keys; use this property - // to quickly determine when to break out of the loop. - if ((uintptr_t) ShadowHstPtrAddr < lb) { - ++it; - continue; - } - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - - // If we copied the struct to the host, we need to restore the pointer. - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Restoring original host pointer value " DPxMOD " for host " - "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; - } - // If the struct is to be deallocated, remove the shadow entry. - if (DelEntry) { - DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); - it = Device.ShadowPtrMap.erase(it); - } else { - ++it; - } - } - Device.ShadowMtx.unlock(); - - // Deallocate map - if (DelEntry) { - int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, - HasCloseModifier); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocating data from device failed.\n"); - return OFFLOAD_FAIL; - } - } - } - } - - return OFFLOAD_SUCCESS; -} - -/// Internal function to pass data to/from the target. -int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { - // process each input. 
- for (int32_t i = 0; i < arg_num; ++i) { - if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || - (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) - continue; - - void *HstPtrBegin = args[i]; - int64_t MapSize = arg_sizes[i]; - bool IsLast, IsHostPtr; - void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, - false, IsHostPtr); - if (!TgtPtrBegin) { - DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); - continue; - } - - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", - DPxPTR(HstPtrBegin)); - continue; - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { - DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); - int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data from device failed.\n"); - return OFFLOAD_FAIL; - } - - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original host pointer value " DPxMOD " for host pointer " - DPxMOD "\n", DPxPTR(it->second.HstPtrVal), - DPxPTR(ShadowHstPtrAddr)); - *ShadowHstPtrAddr = it->second.HstPtrVal; - } - Device.ShadowMtx.unlock(); - } - - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", - arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - - uintptr_t lb = (uintptr_t) HstPtrBegin; - uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; - Device.ShadowMtx.lock(); - for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); - it != Device.ShadowPtrMap.end(); ++it) { - void **ShadowHstPtrAddr = (void**) it->first; - if ((uintptr_t) ShadowHstPtrAddr < lb) - continue; - if ((uintptr_t) ShadowHstPtrAddr >= ub) - break; - DP("Restoring original target pointer value " DPxMOD " for target " - "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), - DPxPTR(it->second.TgtPtrAddr)); - rt = Device.data_submit(it->second.TgtPtrAddr, - &it->second.TgtPtrVal, sizeof(void *), nullptr); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - Device.ShadowMtx.unlock(); - return OFFLOAD_FAIL; - } - } - Device.ShadowMtx.unlock(); - } - } - return OFFLOAD_SUCCESS; -} - -static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | - OMP_TGT_MAPTYPE_LITERAL | - OMP_TGT_MAPTYPE_IMPLICIT; -static bool isLambdaMapping(int64_t Mapping) { - return (Mapping & LambdaMapping) == LambdaMapping; -} - -/// performs the same actions as data_begin in case arg_num is -/// non-zero and initiates run of the offloaded region on the target platform; -/// if arg_num is non-zero after the region execution is done it also -/// performs the same action as data_update and data_end above. This function -/// returns 0 if it was able to transfer the execution to a target and an -/// integer different from zero otherwise. 
-int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { - DeviceTy &Device = Devices[device_id]; - - // Find the table information in the map or look it up in the translation - // tables. - TableMap *TM = 0; - TblMapMtx->lock(); - HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(host_ptr); - if (TableMapIt == HostPtrToTableMap->end()) { - // We don't have a map. So search all the registered libraries. - TrlTblMtx->lock(); - for (HostEntriesBeginToTransTableTy::iterator - ii = HostEntriesBeginToTransTable->begin(), - ie = HostEntriesBeginToTransTable->end(); - !TM && ii != ie; ++ii) { - // get the translation table (which contains all the good info). - TranslationTable *TransTable = &ii->second; - // iterate over all the host table entries to see if we can locate the - // host_ptr. - __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; - __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; - __tgt_offload_entry *cur = begin; - for (uint32_t i = 0; cur < end; ++cur, ++i) { - if (cur->addr != host_ptr) - continue; - // we got a match, now fill the HostPtrToTableMap so that we - // may avoid this search next time. - TM = &(*HostPtrToTableMap)[host_ptr]; - TM->Table = TransTable; - TM->Index = i; - break; - } - } - TrlTblMtx->unlock(); - } else { - TM = &TableMapIt->second; - } - TblMapMtx->unlock(); - - // No map for this host pointer found! - if (!TM) { - DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", - DPxPTR(host_ptr)); - return OFFLOAD_FAIL; - } - - // get target table. - TrlTblMtx->lock(); - assert(TM->Table->TargetsTable.size() > (size_t)device_id && - "Not expecting a device ID outside the table's bounds!"); - __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; - TrlTblMtx->unlock(); - assert(TargetTable && "Global data has not been mapped\n"); - - __tgt_async_info AsyncInfo; - - // Move data to device. - int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); - if (rc != OFFLOAD_SUCCESS) { - DP("Call to target_data_begin failed, abort target.\n"); - return OFFLOAD_FAIL; - } - - std::vector tgt_args; - std::vector tgt_offsets; - - // List of (first-)private arrays allocated for this target region - std::vector fpArrays; - std::vector tgtArgsPositions(arg_num, -1); - - for (int32_t i = 0; i < arg_num; ++i) { - if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { - // This is not a target parameter, do not push it into tgt_args. - // Check for lambda mapping. - if (isLambdaMapping(arg_types[i])) { - assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && - "PTR_AND_OBJ must be also MEMBER_OF."); - unsigned idx = member_of(arg_types[i]); - int tgtIdx = tgtArgsPositions[idx]; - assert(tgtIdx != -1 && "Base address must be translated already."); - // The parent lambda must be processed already and it must be the last - // in tgt_args and tgt_offsets arrays. - void *HstPtrVal = args[i]; - void *HstPtrBegin = args_base[i]; - void *HstPtrBase = args[idx]; - bool IsLast, IsHostPtr; // unused. 
- void *TgtPtrBase = - (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); - DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); - uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; - void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); - void *Pointer_TgtPtrBegin = - Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, - IsHostPtr); - if (!Pointer_TgtPtrBegin) { - DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", - DPxPTR(HstPtrVal)); - continue; - } - if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && - TgtPtrBegin == HstPtrBegin) { - DP("Unified memory is active, no need to map lambda captured" - "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); - continue; - } - DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", - DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); - int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, - sizeof(void *), &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed.\n"); - return OFFLOAD_FAIL; - } - } - continue; - } - void *HstPtrBegin = args[i]; - void *HstPtrBase = args_base[i]; - void *TgtPtrBegin; - ptrdiff_t TgtBaseOffset; - bool IsLast, IsHostPtr; // unused. - if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { - DP("Forwarding first-private value " DPxMOD " to the target construct\n", - DPxPTR(HstPtrBase)); - TgtPtrBegin = HstPtrBase; - TgtBaseOffset = 0; - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { - // Allocate memory for (first-)private array - TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, - arg_sizes[i], HstPtrBegin); - if (!TgtPtrBegin) { - DP ("Data allocation for %sprivate array " DPxMOD " failed, " - "abort target.\n", - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin)); - return OFFLOAD_FAIL; - } - fpArrays.push_back(TgtPtrBegin); - TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; -#ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " - "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", - arg_sizes[i], DPxPTR(TgtPtrBegin), - (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), - DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); -#endif - // If first-private, copy data from host - if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { - int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], - &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Copying data to device failed, failed.\n"); - return OFFLOAD_FAIL; - } - } - } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, - false, IsHostPtr); - TgtBaseOffset = 0; // no offset for ptrs. 
- DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " - "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), - DPxPTR(HstPtrBase)); - } else { - TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, - false, IsHostPtr); - TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; -#ifdef OMPTARGET_DEBUG - void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); - DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", - DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); -#endif - } - tgtArgsPositions[i] = tgt_args.size(); - tgt_args.push_back(TgtPtrBegin); - tgt_offsets.push_back(TgtBaseOffset); - } - - assert(tgt_args.size() == tgt_offsets.size() && - "Size mismatch in arguments and offsets"); - - // Pop loop trip count - uint64_t ltc = 0; - TblMapMtx->lock(); - auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); - if (I != Device.LoopTripCnt.end()) { - ltc = I->second; - Device.LoopTripCnt.erase(I); - DP("loop trip count is %lu.\n", ltc); - } - TblMapMtx->unlock(); - - // Launch device execution. - DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", - TargetTable->EntriesBegin[TM->Index].name, - DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); - if (IsTeamConstruct) { - rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - team_num, thread_limit, ltc, &AsyncInfo); - } else { - rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, - &tgt_args[0], &tgt_offsets[0], tgt_args.size(), - &AsyncInfo); - } - if (rc != OFFLOAD_SUCCESS) { - DP ("Executing target region abort target.\n"); - return OFFLOAD_FAIL; - } - - // Deallocate (first-)private arrays - for (auto it : fpArrays) { - int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); - if (rt != OFFLOAD_SUCCESS) { - DP("Deallocation of (first-)private arrays failed.\n"); - return OFFLOAD_FAIL; - } - } - - // Move data from device. - int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, - arg_types, &AsyncInfo); - if (rt != OFFLOAD_SUCCESS) { - DP("Call to target_data_end failed, abort targe.\n"); - return OFFLOAD_FAIL; - } - - if (Device.RTL->synchronize) - return Device.RTL->synchronize(device_id, &AsyncInfo); - - return OFFLOAD_SUCCESS; -} +//===------ omptarget.cpp - Target independent OpenMP target RTL -- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Implementation of the interface to be used by Clang during the codegen of a +// target region. +// +//===----------------------------------------------------------------------===// + +#include + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include + +#ifdef OMPTARGET_DEBUG +int DebugLevel = 0; +#endif // OMPTARGET_DEBUG + + + +/* All begin addresses for partially mapped structs must be 8-aligned in order + * to ensure proper alignment of members. E.g. + * + * struct S { + * int a; // 4-aligned + * int b; // 4-aligned + * int *p; // 8-aligned + * } s1; + * ... + * #pragma omp target map(tofrom: s1.b, s1.p[0:N]) + * { + * s1.b = 5; + * for (int i...) s1.p[i] = ...; + * } + * + * Here we are mapping s1 starting from member b, so BaseAddress=&s1=&s1.a and + * BeginAddress=&s1.b. 
Let's assume that the struct begins at address 0x100, + * then &s1.a=0x100, &s1.b=0x104, &s1.p=0x108. Each member obeys the alignment + * requirements for its type. Now, when we allocate memory on the device, in + * CUDA's case cuMemAlloc() returns an address which is at least 256-aligned. + * This means that the chunk of the struct on the device will start at a + * 256-aligned address, let's say 0x200. Then the address of b will be 0x200 and + * address of p will be a misaligned 0x204 (on the host there was no need to add + * padding between b and p, so p comes exactly 4 bytes after b). If the device + * kernel tries to access s1.p, a misaligned address error occurs (as reported + * by the CUDA plugin). By padding the begin address down to a multiple of 8 and + * extending the size of the allocated chuck accordingly, the chuck on the + * device will start at 0x200 with the padding (4 bytes), then &s1.b=0x204 and + * &s1.p=0x208, as they should be to satisfy the alignment requirements. + */ +static const int64_t alignment = 8; + +/// Map global data and execute pending ctors +static int InitLibrary(DeviceTy& Device) { + /* + * Map global data + */ + int32_t device_id = Device.DeviceID; + int rc = OFFLOAD_SUCCESS; + + Device.PendingGlobalsMtx.lock(); + TrlTblMtx->lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable->begin(); + ii != HostEntriesBeginToTransTable->end(); ++ii) { + TranslationTable *TransTable = &ii->second; + if (TransTable->HostTable.EntriesBegin == + TransTable->HostTable.EntriesEnd) { + // No host entry so no need to proceed + continue; + } + if (TransTable->TargetsTable[device_id] != 0) { + // Library entries have already been processed + continue; + } + + // 1) get image. + assert(TransTable->TargetsImages.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_device_image *img = TransTable->TargetsImages[device_id]; + if (!img) { + DP("No image loaded for device id %d.\n", device_id); + rc = OFFLOAD_FAIL; + break; + } + // 2) load image into the target table. + __tgt_target_table *TargetTable = + TransTable->TargetsTable[device_id] = Device.load_binary(img); + // Unable to get table for this image: invalidate image and fail. + if (!TargetTable) { + DP("Unable to generate entries table for device id %d.\n", device_id); + TransTable->TargetsImages[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // Verify whether the two table sizes match. + size_t hsize = + TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; + size_t tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; + + // Invalid image for these host entries! + if (hsize != tsize) { + DP("Host and Target tables mismatch for device id %d [%zx != %zx].\n", + device_id, hsize, tsize); + TransTable->TargetsImages[device_id] = 0; + TransTable->TargetsTable[device_id] = 0; + rc = OFFLOAD_FAIL; + break; + } + + // process global data that needs to be mapped. + Device.DataMapMtx.lock(); + __tgt_target_table *HostTable = &TransTable->HostTable; + for (__tgt_offload_entry *CurrDeviceEntry = TargetTable->EntriesBegin, + *CurrHostEntry = HostTable->EntriesBegin, + *EntryDeviceEnd = TargetTable->EntriesEnd; + CurrDeviceEntry != EntryDeviceEnd; + CurrDeviceEntry++, CurrHostEntry++) { + if (CurrDeviceEntry->size != 0) { + // has data. 
+ assert(CurrDeviceEntry->size == CurrHostEntry->size && + "data size mismatch"); + + // Fortran may use multiple weak declarations for the same symbol, + // therefore we must allow for multiple weak symbols to be loaded from + // the fat binary. Treat these mappings as any other "regular" mapping. + // Add entry to map. + if (Device.getTgtPtrBegin(CurrHostEntry->addr, CurrHostEntry->size)) + continue; + DP("Add mapping from host " DPxMOD " to device " DPxMOD " with size %zu" + "\n", DPxPTR(CurrHostEntry->addr), DPxPTR(CurrDeviceEntry->addr), + CurrDeviceEntry->size); + Device.HostDataToTargetMap.push_front(HostDataToTargetTy( + (uintptr_t)CurrHostEntry->addr /*HstPtrBase*/, + (uintptr_t)CurrHostEntry->addr /*HstPtrBegin*/, + (uintptr_t)CurrHostEntry->addr + CurrHostEntry->size /*HstPtrEnd*/, + (uintptr_t)CurrDeviceEntry->addr /*TgtPtrBegin*/, + true /*IsRefCountINF*/)); + } + } + Device.DataMapMtx.unlock(); + } + TrlTblMtx->unlock(); + + if (rc != OFFLOAD_SUCCESS) { + Device.PendingGlobalsMtx.unlock(); + return rc; + } + + /* + * Run ctors for static objects + */ + if (!Device.PendingCtorsDtors.empty()) { + // Call all ctors for all libraries registered so far + for (auto &lib : Device.PendingCtorsDtors) { + if (!lib.second.PendingCtors.empty()) { + DP("Has pending ctors... call now\n"); + for (auto &entry : lib.second.PendingCtors) { + void *ctor = entry; + int rc = target(device_id, ctor, 0, NULL, NULL, NULL, + NULL, 1, 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running ctor " DPxMOD " failed.\n", DPxPTR(ctor)); + Device.PendingGlobalsMtx.unlock(); + return OFFLOAD_FAIL; + } + } + // Clear the list to indicate that this device has been used + lib.second.PendingCtors.clear(); + DP("Done with pending ctors for lib " DPxMOD "\n", DPxPTR(lib.first)); + } + } + } + Device.HasPendingGlobals = false; + Device.PendingGlobalsMtx.unlock(); + + return OFFLOAD_SUCCESS; +} + +// Check whether a device has been initialized, global ctors have been +// executed and global data has been mapped; do so if not already done. +int CheckDeviceAndCtors(int64_t device_id) { + // Is device ready? + if (!device_is_ready(device_id)) { + DP("Device %" PRId64 " is not ready.\n", device_id); + return OFFLOAD_FAIL; + } + + // Get device info. + DeviceTy &Device = Devices[device_id]; + + // Check whether global data has been mapped for this device + Device.PendingGlobalsMtx.lock(); + bool hasPendingGlobals = Device.HasPendingGlobals; + Device.PendingGlobalsMtx.unlock(); + if (hasPendingGlobals && InitLibrary(Device) != OFFLOAD_SUCCESS) { + DP("Failed to init globals on device %" PRId64 "\n", device_id); + return OFFLOAD_FAIL; + } + + return OFFLOAD_SUCCESS; +} + +static int32_t member_of(int64_t type) { + return ((type & OMP_TGT_MAPTYPE_MEMBER_OF) >> 48) - 1; +} + +/// Internal function to do the mapping and transfer the data to the device +int target_data_begin(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { + // process each input. + for (int32_t i = 0; i < arg_num; ++i) { + // Ignore private variables and arrays - there is no mapping for them. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + int64_t data_size = arg_sizes[i]; + + // Adjust for proper alignment if this is a combined entry (for structs). 
+ // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + // Address of pointer on the host and device, respectively. + void *Pointer_HstPtrBegin, *Pointer_TgtPtrBegin; + bool IsNew, Pointer_IsNew; + bool IsHostPtr = false; + bool IsImplicit = arg_types[i] & OMP_TGT_MAPTYPE_IMPLICIT; + // Force the creation of a device side copy of the data when: + // a close map modifier was associated with a map that contained a to. + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + // UpdateRef is based on MEMBER_OF instead of TARGET_PARAM because if we + // have reached this point via __tgt_target_data_begin and not __tgt_target + // then no argument is marked as TARGET_PARAM ("omp target data map" is not + // associated with a target region, so there are no target parameters). This + // may be considered a hack, we could revise the scheme in the future. + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF); + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + DP("Has a pointer entry: \n"); + // base is address of pointer. + Pointer_TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBase, HstPtrBase, + sizeof(void *), Pointer_IsNew, IsHostPtr, IsImplicit, UpdateRef, + HasCloseModifier); + if (!Pointer_TgtPtrBegin) { + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + return OFFLOAD_FAIL; + } + DP("There are %zu bytes allocated at target address " DPxMOD " - is%s new" + "\n", sizeof(void *), DPxPTR(Pointer_TgtPtrBegin), + (Pointer_IsNew ? "" : " not")); + Pointer_HstPtrBegin = HstPtrBase; + // modify current entry. + HstPtrBase = *(void **)HstPtrBase; + UpdateRef = true; // subsequently update ref count of pointee + } + + void *TgtPtrBegin = Device.getOrAllocTgtPtr(HstPtrBegin, HstPtrBase, + data_size, IsNew, IsHostPtr, IsImplicit, UpdateRef, HasCloseModifier); + if (!TgtPtrBegin && data_size) { + // If data_size==0, then the argument could be a zero-length pointer to + // NULL, so getOrAlloc() returning NULL is not an error. + DP("Call to getOrAllocTgtPtr returned null pointer (device failure or " + "illegal mapping).\n"); + } + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s new\n", data_size, DPxPTR(TgtPtrBegin), + (IsNew ? "" : " not")); + + if (arg_types[i] & OMP_TGT_MAPTYPE_RETURN_PARAM) { + uintptr_t Delta = (uintptr_t)HstPtrBegin - (uintptr_t)HstPtrBase; + void *TgtPtrBase = (void *)((uintptr_t)TgtPtrBegin - Delta); + DP("Returning device pointer " DPxMOD "\n", DPxPTR(TgtPtrBase)); + args_base[i] = TgtPtrBase; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + bool copy = false; + if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { + if (IsNew || (arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS)) { + copy = true; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) { + // Copy data only if the "parent" struct has RefCount==1. 
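+          // member_of() decodes the parent argument's index from the upper 16
+          // bits of the map type (biased by one, so a zero field means "no
+          // parent"). Worked example with a made-up value: a MEMBER_OF field
+          // of 0x0003 decodes to index 2, i.e. the parent struct is args[2].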
+ int32_t parent_idx = member_of(arg_types[i]); + uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + copy = true; + } + } + } + + if (copy && !IsHostPtr) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + data_size, DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, data_size, + async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ && !IsHostPtr) { + DP("Update pointer (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBase = (void *)((uint64_t)TgtPtrBegin - Delta); + int rt = Device.data_submit(Pointer_TgtPtrBegin, &TgtPtrBase, + sizeof(void *), async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + // create shadow pointers for this entry + Device.ShadowMtx.lock(); + Device.ShadowPtrMap[Pointer_HstPtrBegin] = {HstPtrBase, + Pointer_TgtPtrBegin, TgtPtrBase}; + Device.ShadowMtx.unlock(); + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to undo the mapping and retrieve the data from the device. +int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr) { + // process each input. + for (int32_t i = arg_num - 1; i >= 0; --i) { + // Ignore private variables and arrays - there is no mapping for them. + // Also, ignore the use_device_ptr directive, it has no effect here. + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t data_size = arg_sizes[i]; + // Adjust for proper alignment if this is a combined entry (for structs). + // Look at the next argument - if that is MEMBER_OF this one, then this one + // is a combined entry. + int64_t padding = 0; + const int next_i = i+1; + if (member_of(arg_types[i]) < 0 && next_i < arg_num && + member_of(arg_types[next_i]) == i) { + padding = (int64_t)HstPtrBegin % alignment; + if (padding) { + DP("Using a padding of %" PRId64 " bytes for begin address " DPxMOD + "\n", padding, DPxPTR(HstPtrBegin)); + HstPtrBegin = (char *) HstPtrBegin - padding; + data_size += padding; + } + } + + bool IsLast, IsHostPtr; + bool UpdateRef = !(arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) || + (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ); + bool ForceDelete = arg_types[i] & OMP_TGT_MAPTYPE_DELETE; + bool HasCloseModifier = arg_types[i] & OMP_TGT_MAPTYPE_CLOSE; + + // If PTR_AND_OBJ, HstPtrBegin is address of pointee + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, data_size, IsLast, + UpdateRef, IsHostPtr); + DP("There are %" PRId64 " bytes allocated at target address " DPxMOD + " - is%s last\n", data_size, DPxPTR(TgtPtrBegin), + (IsLast ? 
"" : " not")); + + bool DelEntry = IsLast || ForceDelete; + + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + DelEntry = false; // protect parent struct from being deallocated + } + + if ((arg_types[i] & OMP_TGT_MAPTYPE_FROM) || DelEntry) { + // Move data back to the host + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + bool Always = arg_types[i] & OMP_TGT_MAPTYPE_ALWAYS; + bool CopyMember = false; + if (!(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) || + HasCloseModifier) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + !(arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ)) { + // Copy data only if the "parent" struct has RefCount==1. + int32_t parent_idx = member_of(arg_types[i]); + uint64_t parent_rc = Device.getMapEntryRefCnt(args[parent_idx]); + assert(parent_rc > 0 && "parent struct not found"); + if (parent_rc == 1) { + CopyMember = true; + } + } + } + + if ((DelEntry || Always || CopyMember) && + !(RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin)) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + data_size, DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, data_size, + async_info_ptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + + // If we copied back to the host a struct/array containing pointers, we + // need to restore the original host pointer values from their shadow + // copies. If the struct is going to be deallocated, remove any remaining + // shadow pointer entries for this struct. + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + data_size; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end();) { + void **ShadowHstPtrAddr = (void**) it->first; + + // An STL map is sorted on its keys; use this property + // to quickly determine when to break out of the loop. + if ((uintptr_t) ShadowHstPtrAddr < lb) { + ++it; + continue; + } + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + + // If we copied the struct to the host, we need to restore the pointer. + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Restoring original host pointer value " DPxMOD " for host " + "pointer " DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + // If the struct is to be deallocated, remove the shadow entry. + if (DelEntry) { + DP("Removing shadow pointer " DPxMOD "\n", DPxPTR(ShadowHstPtrAddr)); + it = Device.ShadowPtrMap.erase(it); + } else { + ++it; + } + } + Device.ShadowMtx.unlock(); + + // Deallocate map + if (DelEntry) { + int rt = Device.deallocTgtPtr(HstPtrBegin, data_size, ForceDelete, + HasCloseModifier); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocating data from device failed.\n"); + return OFFLOAD_FAIL; + } + } + } + } + + return OFFLOAD_SUCCESS; +} + +/// Internal function to pass data to/from the target. +int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types) { + // process each input. 
+ for (int32_t i = 0; i < arg_num; ++i) { + if ((arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) || + (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE)) + continue; + + void *HstPtrBegin = args[i]; + int64_t MapSize = arg_sizes[i]; + bool IsLast, IsHostPtr; + void *TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, MapSize, IsLast, + false, IsHostPtr); + if (!TgtPtrBegin) { + DP("hst data:" DPxMOD " not found, becomes a noop\n", DPxPTR(HstPtrBegin)); + continue; + } + + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("hst data:" DPxMOD " unified and shared, becomes a noop\n", + DPxPTR(HstPtrBegin)); + continue; + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_FROM) { + DP("Moving %" PRId64 " bytes (tgt:" DPxMOD ") -> (hst:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin)); + int rt = Device.data_retrieve(HstPtrBegin, TgtPtrBegin, MapSize, nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data from device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original host pointer value " DPxMOD " for host pointer " + DPxMOD "\n", DPxPTR(it->second.HstPtrVal), + DPxPTR(ShadowHstPtrAddr)); + *ShadowHstPtrAddr = it->second.HstPtrVal; + } + Device.ShadowMtx.unlock(); + } + + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", + arg_sizes[i], DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, MapSize, nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + + uintptr_t lb = (uintptr_t) HstPtrBegin; + uintptr_t ub = (uintptr_t) HstPtrBegin + MapSize; + Device.ShadowMtx.lock(); + for (ShadowPtrListTy::iterator it = Device.ShadowPtrMap.begin(); + it != Device.ShadowPtrMap.end(); ++it) { + void **ShadowHstPtrAddr = (void**) it->first; + if ((uintptr_t) ShadowHstPtrAddr < lb) + continue; + if ((uintptr_t) ShadowHstPtrAddr >= ub) + break; + DP("Restoring original target pointer value " DPxMOD " for target " + "pointer " DPxMOD "\n", DPxPTR(it->second.TgtPtrVal), + DPxPTR(it->second.TgtPtrAddr)); + rt = Device.data_submit(it->second.TgtPtrAddr, + &it->second.TgtPtrVal, sizeof(void *), nullptr); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + Device.ShadowMtx.unlock(); + return OFFLOAD_FAIL; + } + } + Device.ShadowMtx.unlock(); + } + } + return OFFLOAD_SUCCESS; +} + +static const unsigned LambdaMapping = OMP_TGT_MAPTYPE_PTR_AND_OBJ | + OMP_TGT_MAPTYPE_LITERAL | + OMP_TGT_MAPTYPE_IMPLICIT; +static bool isLambdaMapping(int64_t Mapping) { + return (Mapping & LambdaMapping) == LambdaMapping; +} + +/// performs the same actions as data_begin in case arg_num is +/// non-zero and initiates run of the offloaded region on the target platform; +/// if arg_num is non-zero after the region execution is done it also +/// performs the same action as data_update and data_end above. This function +/// returns 0 if it was able to transfer the execution to a target and an +/// integer different from zero otherwise. 
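+/// The caller is expected to have ensured that the device is ready and that
+/// its global data has been initialized (e.g. via CheckDeviceAndCtors above)
+/// before invoking this entry point; device_id is used to index Devices
+/// directly.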
+int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct) { + DeviceTy &Device = Devices[device_id]; + + // Find the table information in the map or look it up in the translation + // tables. + TableMap *TM = 0; + TblMapMtx->lock(); + HostPtrToTableMapTy::iterator TableMapIt = HostPtrToTableMap->find(host_ptr); + if (TableMapIt == HostPtrToTableMap->end()) { + // We don't have a map. So search all the registered libraries. + TrlTblMtx->lock(); + for (HostEntriesBeginToTransTableTy::iterator + ii = HostEntriesBeginToTransTable->begin(), + ie = HostEntriesBeginToTransTable->end(); + !TM && ii != ie; ++ii) { + // get the translation table (which contains all the good info). + TranslationTable *TransTable = &ii->second; + // iterate over all the host table entries to see if we can locate the + // host_ptr. + __tgt_offload_entry *begin = TransTable->HostTable.EntriesBegin; + __tgt_offload_entry *end = TransTable->HostTable.EntriesEnd; + __tgt_offload_entry *cur = begin; + for (uint32_t i = 0; cur < end; ++cur, ++i) { + if (cur->addr != host_ptr) + continue; + // we got a match, now fill the HostPtrToTableMap so that we + // may avoid this search next time. + TM = &(*HostPtrToTableMap)[host_ptr]; + TM->Table = TransTable; + TM->Index = i; + break; + } + } + TrlTblMtx->unlock(); + } else { + TM = &TableMapIt->second; + } + TblMapMtx->unlock(); + + // No map for this host pointer found! + if (!TM) { + DP("Host ptr " DPxMOD " does not have a matching target pointer.\n", + DPxPTR(host_ptr)); + return OFFLOAD_FAIL; + } + + // get target table. + TrlTblMtx->lock(); + assert(TM->Table->TargetsTable.size() > (size_t)device_id && + "Not expecting a device ID outside the table's bounds!"); + __tgt_target_table *TargetTable = TM->Table->TargetsTable[device_id]; + TrlTblMtx->unlock(); + assert(TargetTable && "Global data has not been mapped\n"); + + __tgt_async_info AsyncInfo; + + // Move data to device. + int rc = target_data_begin(Device, arg_num, args_base, args, arg_sizes, + arg_types, &AsyncInfo); + if (rc != OFFLOAD_SUCCESS) { + DP("Call to target_data_begin failed, abort target.\n"); + return OFFLOAD_FAIL; + } + + std::vector tgt_args; + std::vector tgt_offsets; + + // List of (first-)private arrays allocated for this target region + std::vector fpArrays; + std::vector tgtArgsPositions(arg_num, -1); + + for (int32_t i = 0; i < arg_num; ++i) { + if (!(arg_types[i] & OMP_TGT_MAPTYPE_TARGET_PARAM)) { + // This is not a target parameter, do not push it into tgt_args. + // Check for lambda mapping. + if (isLambdaMapping(arg_types[i])) { + assert((arg_types[i] & OMP_TGT_MAPTYPE_MEMBER_OF) && + "PTR_AND_OBJ must be also MEMBER_OF."); + unsigned idx = member_of(arg_types[i]); + int tgtIdx = tgtArgsPositions[idx]; + assert(tgtIdx != -1 && "Base address must be translated already."); + // The parent lambda must be processed already and it must be the last + // in tgt_args and tgt_offsets arrays. + void *HstPtrVal = args[i]; + void *HstPtrBegin = args_base[i]; + void *HstPtrBase = args[idx]; + bool IsLast, IsHostPtr; // unused. 
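+        // Reconstruct the device address of this capture: start from the
+        // parent lambda's translated base (tgt_args[tgtIdx] plus its offset)
+        // and add the same delta the captured field has from the lambda
+        // object on the host.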
+ void *TgtPtrBase = + (void *)((intptr_t)tgt_args[tgtIdx] + tgt_offsets[tgtIdx]); + DP("Parent lambda base " DPxMOD "\n", DPxPTR(TgtPtrBase)); + uint64_t Delta = (uint64_t)HstPtrBegin - (uint64_t)HstPtrBase; + void *TgtPtrBegin = (void *)((uintptr_t)TgtPtrBase + Delta); + void *Pointer_TgtPtrBegin = + Device.getTgtPtrBegin(HstPtrVal, arg_sizes[i], IsLast, false, + IsHostPtr); + if (!Pointer_TgtPtrBegin) { + DP("No lambda captured variable mapped (" DPxMOD ") - ignored\n", + DPxPTR(HstPtrVal)); + continue; + } + if (RTLs->RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY && + TgtPtrBegin == HstPtrBegin) { + DP("Unified memory is active, no need to map lambda captured" + "variable (" DPxMOD ")\n", DPxPTR(HstPtrVal)); + continue; + } + DP("Update lambda reference (" DPxMOD ") -> [" DPxMOD "]\n", + DPxPTR(Pointer_TgtPtrBegin), DPxPTR(TgtPtrBegin)); + int rt = Device.data_submit(TgtPtrBegin, &Pointer_TgtPtrBegin, + sizeof(void *), &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed.\n"); + return OFFLOAD_FAIL; + } + } + continue; + } + void *HstPtrBegin = args[i]; + void *HstPtrBase = args_base[i]; + void *TgtPtrBegin; + ptrdiff_t TgtBaseOffset; + bool IsLast, IsHostPtr; // unused. + if (arg_types[i] & OMP_TGT_MAPTYPE_LITERAL) { + DP("Forwarding first-private value " DPxMOD " to the target construct\n", + DPxPTR(HstPtrBase)); + TgtPtrBegin = HstPtrBase; + TgtBaseOffset = 0; + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PRIVATE) { + // Allocate memory for (first-)private array + TgtPtrBegin = Device.RTL->data_alloc(Device.RTLDeviceID, + arg_sizes[i], HstPtrBegin); + if (!TgtPtrBegin) { + DP ("Data allocation for %sprivate array " DPxMOD " failed, " + "abort target.\n", + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin)); + return OFFLOAD_FAIL; + } + fpArrays.push_back(TgtPtrBegin); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Allocated %" PRId64 " bytes of target memory at " DPxMOD " for " + "%sprivate array " DPxMOD " - pushing target argument " DPxMOD "\n", + arg_sizes[i], DPxPTR(TgtPtrBegin), + (arg_types[i] & OMP_TGT_MAPTYPE_TO ? "first-" : ""), + DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBase)); +#endif + // If first-private, copy data from host + if (arg_types[i] & OMP_TGT_MAPTYPE_TO) { + int rt = Device.data_submit(TgtPtrBegin, HstPtrBegin, arg_sizes[i], + &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Copying data to device failed, failed.\n"); + return OFFLOAD_FAIL; + } + } + } else if (arg_types[i] & OMP_TGT_MAPTYPE_PTR_AND_OBJ) { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBase, sizeof(void *), IsLast, + false, IsHostPtr); + TgtBaseOffset = 0; // no offset for ptrs. 
+ DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD " to " + "object " DPxMOD "\n", DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBase), + DPxPTR(HstPtrBase)); + } else { + TgtPtrBegin = Device.getTgtPtrBegin(HstPtrBegin, arg_sizes[i], IsLast, + false, IsHostPtr); + TgtBaseOffset = (intptr_t)HstPtrBase - (intptr_t)HstPtrBegin; +#ifdef OMPTARGET_DEBUG + void *TgtPtrBase = (void *)((intptr_t)TgtPtrBegin + TgtBaseOffset); + DP("Obtained target argument " DPxMOD " from host pointer " DPxMOD "\n", + DPxPTR(TgtPtrBase), DPxPTR(HstPtrBegin)); +#endif + } + tgtArgsPositions[i] = tgt_args.size(); + tgt_args.push_back(TgtPtrBegin); + tgt_offsets.push_back(TgtBaseOffset); + } + + assert(tgt_args.size() == tgt_offsets.size() && + "Size mismatch in arguments and offsets"); + + // Pop loop trip count + uint64_t ltc = 0; + TblMapMtx->lock(); + auto I = Device.LoopTripCnt.find(__kmpc_global_thread_num(NULL)); + if (I != Device.LoopTripCnt.end()) { + ltc = I->second; + Device.LoopTripCnt.erase(I); + DP("loop trip count is %lu.\n", ltc); + } + TblMapMtx->unlock(); + + // Launch device execution. + DP("Launching target execution %s with pointer " DPxMOD " (index=%d).\n", + TargetTable->EntriesBegin[TM->Index].name, + DPxPTR(TargetTable->EntriesBegin[TM->Index].addr), TM->Index); + if (IsTeamConstruct) { + rc = Device.run_team_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + team_num, thread_limit, ltc, &AsyncInfo); + } else { + rc = Device.run_region(TargetTable->EntriesBegin[TM->Index].addr, + &tgt_args[0], &tgt_offsets[0], tgt_args.size(), + &AsyncInfo); + } + if (rc != OFFLOAD_SUCCESS) { + DP ("Executing target region abort target.\n"); + return OFFLOAD_FAIL; + } + + // Deallocate (first-)private arrays + for (auto it : fpArrays) { + int rt = Device.RTL->data_delete(Device.RTLDeviceID, it); + if (rt != OFFLOAD_SUCCESS) { + DP("Deallocation of (first-)private arrays failed.\n"); + return OFFLOAD_FAIL; + } + } + + // Move data from device. + int rt = target_data_end(Device, arg_num, args_base, args, arg_sizes, + arg_types, &AsyncInfo); + if (rt != OFFLOAD_SUCCESS) { + DP("Call to target_data_end failed, abort targe.\n"); + return OFFLOAD_FAIL; + } + + if (Device.RTL->synchronize) + return Device.RTL->synchronize(device_id, &AsyncInfo); + + return OFFLOAD_SUCCESS; +} diff --git a/openmp/libomptarget/src/private.h b/openmp/libomptarget/src/private.h index dbc5bafbab5bf..866c2e54413ac 100644 --- a/openmp/libomptarget/src/private.h +++ b/openmp/libomptarget/src/private.h @@ -1,108 +1,108 @@ -//===---------- private.h - Target independent OpenMP target RTL ----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Private function declarations and helper macros for debugging output. 
-// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_PRIVATE_H -#define _OMPTARGET_PRIVATE_H - -#include - -#include - -extern int target_data_begin(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, - int64_t *arg_types, - __tgt_async_info *async_info_ptr); - -extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, - void **args, int64_t *arg_sizes, int64_t *arg_types, - __tgt_async_info *async_info_ptr); - -extern int target_data_update(DeviceTy &Device, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); - -extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, - void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, - int32_t team_num, int32_t thread_limit, int IsTeamConstruct); - -extern int CheckDeviceAndCtors(int64_t device_id); - -// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition -enum kmp_target_offload_kind { - tgt_disabled = 0, - tgt_default = 1, - tgt_mandatory = 2 -}; -typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; -extern kmp_target_offload_kind_t TargetOffloadPolicy; - -// This structure stores information of a mapped memory region. -struct MapComponentInfoTy { - void *Base; - void *Begin; - int64_t Size; - int64_t Type; - MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} -}; - -// This structure stores all components of a user-defined mapper. The number of -// components are dynamically decided, so we utilize C++ STL vector -// implementation here. -struct MapperComponentsTy { - std::vector Components; -}; - -//////////////////////////////////////////////////////////////////////////////// -// implementation for fatal messages -//////////////////////////////////////////////////////////////////////////////// - -#define FATAL_MESSAGE0(_num, _str) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ - exit(1); \ - } while (0) - -#define FATAL_MESSAGE(_num, _str, ...) \ - do { \ - fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ - __VA_ARGS__); \ - exit(1); \ - } while (0) - -// Implemented in libomp, they are called from within __tgt_* functions. -#ifdef __cplusplus -extern "C" { -#endif -// functions that extract info from libomp; keep in sync -int omp_get_default_device(void) __attribute__((weak)); -int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); -int32_t __kmpc_global_thread_num(void *) __attribute__((weak)); -int __kmpc_get_target_offload(void) __attribute__((weak)); -#ifdef __cplusplus -} -#endif - -#ifdef OMPTARGET_DEBUG -extern int DebugLevel; - -#define DP(...) \ - do { \ - if (DebugLevel > 0) { \ - DEBUGP("Libomptarget", __VA_ARGS__); \ - } \ - } while (false) -#else // OMPTARGET_DEBUG -#define DP(...) {} -#endif // OMPTARGET_DEBUG - -#endif +//===---------- private.h - Target independent OpenMP target RTL ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Private function declarations and helper macros for debugging output. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _OMPTARGET_PRIVATE_H +#define _OMPTARGET_PRIVATE_H + +#include + +#include + +extern int target_data_begin(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, + int64_t *arg_types, + __tgt_async_info *async_info_ptr); + +extern int target_data_end(DeviceTy &Device, int32_t arg_num, void **args_base, + void **args, int64_t *arg_sizes, int64_t *arg_types, + __tgt_async_info *async_info_ptr); + +extern int target_data_update(DeviceTy &Device, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); + +extern int target(int64_t device_id, void *host_ptr, int32_t arg_num, + void **args_base, void **args, int64_t *arg_sizes, int64_t *arg_types, + int32_t team_num, int32_t thread_limit, int IsTeamConstruct); + +extern int CheckDeviceAndCtors(int64_t device_id); + +// enum for OMP_TARGET_OFFLOAD; keep in sync with kmp.h definition +enum kmp_target_offload_kind { + tgt_disabled = 0, + tgt_default = 1, + tgt_mandatory = 2 +}; +typedef enum kmp_target_offload_kind kmp_target_offload_kind_t; +extern kmp_target_offload_kind_t TargetOffloadPolicy; + +// This structure stores information of a mapped memory region. +struct MapComponentInfoTy { + void *Base; + void *Begin; + int64_t Size; + int64_t Type; + MapComponentInfoTy() = default; + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) + : Base(Base), Begin(Begin), Size(Size), Type(Type) {} +}; + +// This structure stores all components of a user-defined mapper. The number of +// components are dynamically decided, so we utilize C++ STL vector +// implementation here. +struct MapperComponentsTy { + std::vector Components; +}; + +//////////////////////////////////////////////////////////////////////////////// +// implementation for fatal messages +//////////////////////////////////////////////////////////////////////////////// + +#define FATAL_MESSAGE0(_num, _str) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d: %s\n", _num, _str); \ + exit(1); \ + } while (0) + +#define FATAL_MESSAGE(_num, _str, ...) \ + do { \ + fprintf(stderr, "Libomptarget fatal error %d:" _str "\n", _num, \ + __VA_ARGS__); \ + exit(1); \ + } while (0) + +// Implemented in libomp, they are called from within __tgt_* functions. +#ifdef __cplusplus +extern "C" { +#endif +// functions that extract info from libomp; keep in sync +int omp_get_default_device(void) __attribute__((weak)); +int32_t __kmpc_omp_taskwait(void *loc_ref, int32_t gtid) __attribute__((weak)); +int32_t __kmpc_global_thread_num(void *) __attribute__((weak)); +int __kmpc_get_target_offload(void) __attribute__((weak)); +#ifdef __cplusplus +} +#endif + +#ifdef OMPTARGET_DEBUG +extern int DebugLevel; + +#define DP(...) \ + do { \ + if (DebugLevel > 0) { \ + DEBUGP("Libomptarget", __VA_ARGS__); \ + } \ + } while (false) +#else // OMPTARGET_DEBUG +#define DP(...) {} +#endif // OMPTARGET_DEBUG + +#endif diff --git a/openmp/libomptarget/src/rtl.cpp b/openmp/libomptarget/src/rtl.cpp index 1439f67e7c648..7ee8377d33399 100644 --- a/openmp/libomptarget/src/rtl.cpp +++ b/openmp/libomptarget/src/rtl.cpp @@ -1,434 +1,438 @@ -//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Functionality for handling RTL plugins. -// -//===----------------------------------------------------------------------===// - -#include "device.h" -#include "private.h" -#include "rtl.h" - -#include -#include -#include -#include -#include -#include - -// List of all plugins that can support offloading. -static const char *RTLNames[] = { - /* PowerPC target */ "libomptarget.rtl.ppc64.so", - /* x86_64 target */ "libomptarget.rtl.x86_64.so", - /* CUDA target */ "libomptarget.rtl.cuda.so", - /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; - -RTLsTy *RTLs; -std::mutex *RTLsMtx; - -HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -std::mutex *TrlTblMtx; - -HostPtrToTableMapTy *HostPtrToTableMap; -std::mutex *TblMapMtx; - -__attribute__((constructor(101))) void init() { - DP("Init target library!\n"); - RTLs = new RTLsTy(); - RTLsMtx = new std::mutex(); - HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); - TrlTblMtx = new std::mutex(); - HostPtrToTableMap = new HostPtrToTableMapTy(); - TblMapMtx = new std::mutex(); -} - -__attribute__((destructor(101))) void deinit() { - DP("Deinit target library!\n"); - delete RTLs; - delete RTLsMtx; - delete HostEntriesBeginToTransTable; - delete TrlTblMtx; - delete HostPtrToTableMap; - delete TblMapMtx; -} - -void RTLsTy::LoadRTLs() { -#ifdef OMPTARGET_DEBUG - if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { - DebugLevel = std::stoi(envStr); - } -#endif // OMPTARGET_DEBUG - - // Parse environment variable OMP_TARGET_OFFLOAD (if set) - TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); - if (TargetOffloadPolicy == tgt_disabled) { - return; - } - - DP("Loading RTLs...\n"); - - // Attempt to open all the plugins and, if they exist, check if the interface - // is correct and if they are supporting any devices. - for (auto *Name : RTLNames) { - DP("Loading library '%s'...\n", Name); - void *dynlib_handle = dlopen(Name, RTLD_NOW); - - if (!dynlib_handle) { - // Library does not exist or cannot be found. - DP("Unable to load library '%s': %s!\n", Name, dlerror()); - continue; - } - - DP("Successfully loaded library '%s'!\n", Name); - - // Retrieve the RTL information from the runtime library. 
- RTLInfoTy R; - - R.LibraryHandler = dynlib_handle; - R.isUsed = false; - -#ifdef OMPTARGET_DEBUG - R.RTLName = Name; -#endif - - if (!(*((void **)&R.is_valid_binary) = - dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) - continue; - if (!(*((void **)&R.number_of_devices) = - dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) - continue; - if (!(*((void **)&R.init_device) = - dlsym(dynlib_handle, "__tgt_rtl_init_device"))) - continue; - if (!(*((void **)&R.load_binary) = - dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) - continue; - if (!(*((void **)&R.data_alloc) = - dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) - continue; - if (!(*((void **)&R.data_submit) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) - continue; - if (!(*((void **)&R.data_retrieve) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) - continue; - if (!(*((void **)&R.data_delete) = - dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) - continue; - if (!(*((void **)&R.run_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) - continue; - if (!(*((void **)&R.run_team_region) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) - continue; - - // Optional functions - *((void **)&R.init_requires) = - dlsym(dynlib_handle, "__tgt_rtl_init_requires"); - *((void **)&R.data_submit_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); - *((void **)&R.data_retrieve_async) = - dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); - *((void **)&R.run_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); - *((void **)&R.run_team_region_async) = - dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); - *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); - - // No devices are supported by this RTL? - if (!(R.NumberOfDevices = R.number_of_devices())) { - DP("No devices supported in this RTL\n"); - continue; - } - - DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), - R.NumberOfDevices); - - // The RTL is valid! Will save the information in the RTLs list. - AllRTLs.push_back(R); - } - - DP("RTLs loaded!\n"); - - return; -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering libs - -static void RegisterImageIntoTranslationTable(TranslationTable &TT, - RTLInfoTy &RTL, __tgt_device_image *image) { - - // same size, as when we increase one, we also increase the other. - assert(TT.TargetsTable.size() == TT.TargetsImages.size() && - "We should have as many images as we have tables!"); - - // Resize the Targets Table and Images to accommodate the new targets if - // required - unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; - - if (TT.TargetsTable.size() < TargetsTableMinimumSize) { - TT.TargetsImages.resize(TargetsTableMinimumSize, 0); - TT.TargetsTable.resize(TargetsTableMinimumSize, 0); - } - - // Register the image in all devices for this target type. - for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { - // If we are changing the image we are also invalidating the target table. - if (TT.TargetsImages[RTL.Idx + i] != image) { - TT.TargetsImages[RTL.Idx + i] = image; - TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. 
- } - } -} - -//////////////////////////////////////////////////////////////////////////////// -// Functionality for registering Ctors/Dtors - -static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, - __tgt_device_image *img, RTLInfoTy *RTL) { - - for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[RTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - Device.HasPendingGlobals = true; - for (__tgt_offload_entry *entry = img->EntriesBegin; - entry != img->EntriesEnd; ++entry) { - if (entry->flags & OMP_DECLARE_TARGET_CTOR) { - DP("Adding ctor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); - } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { - // Dtors are pushed in reverse order so they are executed from end - // to beginning when unregistering the library! - DP("Adding dtor " DPxMOD " to the pending list.\n", - DPxPTR(entry->addr)); - Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); - } - - if (entry->flags & OMP_DECLARE_TARGET_LINK) { - DP("The \"link\" attribute is not yet supported!\n"); - } - } - Device.PendingGlobalsMtx.unlock(); - } -} - -void RTLsTy::RegisterRequires(int64_t flags) { - // TODO: add more elaborate check. - // Minimal check: only set requires flags if previous value - // is undefined. This ensures that only the first call to this - // function will set the requires flags. All subsequent calls - // will be checked for compatibility. - assert(flags != OMP_REQ_UNDEFINED && - "illegal undefined flag for requires directive!"); - if (RequiresFlags == OMP_REQ_UNDEFINED) { - RequiresFlags = flags; - return; - } - - // If multiple compilation units are present enforce - // consistency across all of them for require clauses: - // - reverse_offload - // - unified_address - // - unified_shared_memory - if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != - (flags & OMP_REQ_REVERSE_OFFLOAD)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires reverse_offload' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != - (flags & OMP_REQ_UNIFIED_ADDRESS)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_address' not used consistently!"); - } - if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != - (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { - FATAL_MESSAGE0(1, - "'#pragma omp requires unified_shared_memory' not used consistently!"); - } - - // TODO: insert any other missing checks - - DP("New requires flags %ld compatible with existing %ld!\n", - flags, RequiresFlags); -} - -void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { - // Attempt to load all plugins available in the system. - std::call_once(initFlag, &RTLsTy::LoadRTLs, this); - - RTLsMtx->lock(); - // Register the images with the RTLs that understand them, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. - for (auto &R : AllRTLs) { - if (!R.is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - - // If this RTL is not already in use, initialize it. - if (!R.isUsed) { - // Initialize the device information for the RTL we are about to use. 
- DeviceTy device(&R); - size_t start = Devices.size(); - Devices.resize(start + R.NumberOfDevices, device); - for (int32_t device_id = 0; device_id < R.NumberOfDevices; - device_id++) { - // global device ID - Devices[start + device_id].DeviceID = start + device_id; - // RTL local device ID - Devices[start + device_id].RTLDeviceID = device_id; - } - - // Initialize the index of this RTL and save it in the used RTLs. - R.Idx = (UsedRTLs.empty()) - ? 0 - : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; - assert((size_t) R.Idx == start && - "RTL index should equal the number of devices used so far."); - R.isUsed = true; - UsedRTLs.push_back(&R); - - DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); - } - - // Initialize (if necessary) translation table for this library. - TrlTblMtx->lock(); - if(!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)){ - TranslationTable &tt = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - tt.HostTable.EntriesBegin = desc->HostEntriesBegin; - tt.HostTable.EntriesEnd = desc->HostEntriesEnd; - } - - // Retrieve translation table for this library. - TranslationTable &TransTable = - (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; - - DP("Registering image " DPxMOD " with RTL %s!\n", - DPxPTR(img->ImageStart), R.RTLName.c_str()); - RegisterImageIntoTranslationTable(TransTable, R, img); - TrlTblMtx->unlock(); - FoundRTL = &R; - - // Load ctors/dtors for static objects - RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); - - // if an RTL was found we are done - proceed to register the next image - break; - } - - if (!FoundRTL) { - DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - - - DP("Done registering entries!\n"); -} - -void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { - DP("Unloading target library!\n"); - - RTLsMtx->lock(); - // Find which RTL understands each image, if any. - for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { - // Obtain the image. - __tgt_device_image *img = &desc->DeviceImages[i]; - - RTLInfoTy *FoundRTL = NULL; - - // Scan the RTLs that have associated images until we find one that supports - // the current image. We only need to scan RTLs that are already being used. - for (auto *R : UsedRTLs) { - - assert(R->isUsed && "Expecting used RTLs."); - - if (!R->is_valid_binary(img)) { - DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - continue; - } - - DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - FoundRTL = R; - - // Execute dtors for static objects if the device has been used, i.e. - // if its PendingCtors list has been emptied. 
- for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { - DeviceTy &Device = Devices[FoundRTL->Idx + i]; - Device.PendingGlobalsMtx.lock(); - if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { - for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { - int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, - 1, true /*team*/); - if (rc != OFFLOAD_SUCCESS) { - DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); - } - } - // Remove this library's entry from PendingCtorsDtors - Device.PendingCtorsDtors.erase(desc); - } - Device.PendingGlobalsMtx.unlock(); - } - - DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", - DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); - - break; - } - - // if no RTL was found proceed to unregister the next image - if (!FoundRTL){ - DP("No RTLs in use support the image " DPxMOD "!\n", - DPxPTR(img->ImageStart)); - } - } - RTLsMtx->unlock(); - DP("Done unregistering images!\n"); - - // Remove entries from HostPtrToTableMap - TblMapMtx->lock(); - for (__tgt_offload_entry *cur = desc->HostEntriesBegin; - cur < desc->HostEntriesEnd; ++cur) { - HostPtrToTableMap->erase(cur->addr); - } - - // Remove translation table for this descriptor. - auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); - if (tt != HostEntriesBeginToTransTable->end()) { - DP("Removing translation table for descriptor " DPxMOD "\n", - DPxPTR(desc->HostEntriesBegin)); - HostEntriesBeginToTransTable->erase(tt); - } else { - DP("Translation table for descriptor " DPxMOD " cannot be found, probably " - "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); - } - - TblMapMtx->unlock(); - - // TODO: Remove RTL and the devices it manages if it's not used anymore? - // TODO: Write some RTL->unload_image(...) function? - - DP("Done unregistering library!\n"); -} +//===----------- rtl.cpp - Target independent OpenMP target RTL -----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Functionality for handling RTL plugins. +// +//===----------------------------------------------------------------------===// + +#include "device.h" +#include "private.h" +#include "rtl.h" + +#include +#include +#include +#include +#include +#include + +// List of all plugins that can support offloading. 
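+// Each name is dlopen()ed from the regular dynamic linker search path when
+// the first library is registered; a plugin that is missing or exposes an
+// incomplete interface is simply skipped (see LoadRTLs below).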
+static const char *RTLNames[] = { + /* PowerPC target */ "libomptarget.rtl.ppc64.so", + /* x86_64 target */ "libomptarget.rtl.x86_64.so", + /* CUDA target */ "libomptarget.rtl.cuda.so", + /* AArch64 target */ "libomptarget.rtl.aarch64.so"}; + +RTLsTy *RTLs; +std::mutex *RTLsMtx; + +HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; +std::mutex *TrlTblMtx; + +HostPtrToTableMapTy *HostPtrToTableMap; +std::mutex *TblMapMtx; + +__attribute__((constructor(101))) void init() { + DP("Init target library!\n"); + RTLs = new RTLsTy(); + RTLsMtx = new std::mutex(); + HostEntriesBeginToTransTable = new HostEntriesBeginToTransTableTy(); + TrlTblMtx = new std::mutex(); + HostPtrToTableMap = new HostPtrToTableMapTy(); + TblMapMtx = new std::mutex(); +} + +__attribute__((destructor(101))) void deinit() { + DP("Deinit target library!\n"); + delete RTLs; + delete RTLsMtx; + delete HostEntriesBeginToTransTable; + delete TrlTblMtx; + delete HostPtrToTableMap; + delete TblMapMtx; +} + +void RTLsTy::LoadRTLs() { +#ifdef OMPTARGET_DEBUG + if (char *envStr = getenv("LIBOMPTARGET_DEBUG")) { + DebugLevel = std::stoi(envStr); + } +#endif // OMPTARGET_DEBUG + + // Parse environment variable OMP_TARGET_OFFLOAD (if set) + TargetOffloadPolicy = (kmp_target_offload_kind_t) __kmpc_get_target_offload(); + if (TargetOffloadPolicy == tgt_disabled) { + return; + } + + DP("Loading RTLs...\n"); + + // Attempt to open all the plugins and, if they exist, check if the interface + // is correct and if they are supporting any devices. + for (auto *Name : RTLNames) { + DP("Loading library '%s'...\n", Name); + void *dynlib_handle = dlopen(Name, RTLD_NOW); + + if (!dynlib_handle) { + // Library does not exist or cannot be found. + DP("Unable to load library '%s': %s!\n", Name, dlerror()); + continue; + } + + DP("Successfully loaded library '%s'!\n", Name); + + // Retrieve the RTL information from the runtime library. 
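+    // The entry points below are looked up with dlsym(): the core ones
+    // (is_valid_binary, number_of_devices, data_alloc, run_region, ...) are
+    // mandatory and their absence disqualifies the plugin, while the others
+    // (init_requires, the async variants, synchronize) are optional.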
+ RTLInfoTy R; + + R.LibraryHandler = dynlib_handle; + R.isUsed = false; + + + R.RTLName = Name; + + + if (!(*((void **)&R.is_valid_binary) = + dlsym(dynlib_handle, "__tgt_rtl_is_valid_binary"))) + continue; + if (!(*((void **)&R.number_of_devices) = + dlsym(dynlib_handle, "__tgt_rtl_number_of_devices"))) + continue; + if (!(*((void **)&R.init_device) = + dlsym(dynlib_handle, "__tgt_rtl_init_device"))) + continue; + if (!(*((void **)&R.load_binary) = + dlsym(dynlib_handle, "__tgt_rtl_load_binary"))) + continue; + if (!(*((void **)&R.data_alloc) = + dlsym(dynlib_handle, "__tgt_rtl_data_alloc"))) + continue; + if (!(*((void **)&R.data_submit) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit"))) + continue; + if (!(*((void **)&R.data_retrieve) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve"))) + continue; + if (!(*((void **)&R.data_delete) = + dlsym(dynlib_handle, "__tgt_rtl_data_delete"))) + continue; + if (!(*((void **)&R.run_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region"))) + continue; + if (!(*((void **)&R.run_team_region) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region"))) + continue; + + // Optional functions + *((void **)&R.init_requires) = + dlsym(dynlib_handle, "__tgt_rtl_init_requires"); + *((void **)&R.data_submit_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_submit_async"); + *((void **)&R.data_retrieve_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_retrieve_async"); + *((void **)&R.data_transfer) = + dlsym(dynlib_handle, "__tgt_rtl_data_transfer"); + *((void **)&R.data_transfer_async) = + dlsym(dynlib_handle, "__tgt_rtl_data_transfer_async"); + *((void **)&R.run_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_region_async"); + *((void **)&R.run_team_region_async) = + dlsym(dynlib_handle, "__tgt_rtl_run_target_team_region_async"); + *((void **)&R.synchronize) = dlsym(dynlib_handle, "__tgt_rtl_synchronize"); + + // No devices are supported by this RTL? + if (!(R.NumberOfDevices = R.number_of_devices())) { + DP("No devices supported in this RTL\n"); + continue; + } + + DP("Registering RTL %s supporting %d devices!\n", R.RTLName.c_str(), + R.NumberOfDevices); + + // The RTL is valid! Will save the information in the RTLs list. + AllRTLs.push_back(R); + } + + DP("RTLs loaded!\n"); + + return; +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering libs + +static void RegisterImageIntoTranslationTable(TranslationTable &TT, + RTLInfoTy &RTL, __tgt_device_image *image) { + + // same size, as when we increase one, we also increase the other. + assert(TT.TargetsTable.size() == TT.TargetsImages.size() && + "We should have as many images as we have tables!"); + + // Resize the Targets Table and Images to accommodate the new targets if + // required + unsigned TargetsTableMinimumSize = RTL.Idx + RTL.NumberOfDevices; + + if (TT.TargetsTable.size() < TargetsTableMinimumSize) { + TT.TargetsImages.resize(TargetsTableMinimumSize, 0); + TT.TargetsTable.resize(TargetsTableMinimumSize, 0); + } + + // Register the image in all devices for this target type. + for (int32_t i = 0; i < RTL.NumberOfDevices; ++i) { + // If we are changing the image we are also invalidating the target table. + if (TT.TargetsImages[RTL.Idx + i] != image) { + TT.TargetsImages[RTL.Idx + i] = image; + TT.TargetsTable[RTL.Idx + i] = 0; // lazy initialization of target table. 
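+      // The cleared entry is filled in again on first use, when InitLibrary
+      // loads the image for that device via Device.load_binary().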
+ } + } +} + +//////////////////////////////////////////////////////////////////////////////// +// Functionality for registering Ctors/Dtors + +static void RegisterGlobalCtorsDtorsForImage(__tgt_bin_desc *desc, + __tgt_device_image *img, RTLInfoTy *RTL) { + + for (int32_t i = 0; i < RTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[RTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + Device.HasPendingGlobals = true; + for (__tgt_offload_entry *entry = img->EntriesBegin; + entry != img->EntriesEnd; ++entry) { + if (entry->flags & OMP_DECLARE_TARGET_CTOR) { + DP("Adding ctor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingCtors.push_back(entry->addr); + } else if (entry->flags & OMP_DECLARE_TARGET_DTOR) { + // Dtors are pushed in reverse order so they are executed from end + // to beginning when unregistering the library! + DP("Adding dtor " DPxMOD " to the pending list.\n", + DPxPTR(entry->addr)); + Device.PendingCtorsDtors[desc].PendingDtors.push_front(entry->addr); + } + + if (entry->flags & OMP_DECLARE_TARGET_LINK) { + DP("The \"link\" attribute is not yet supported!\n"); + } + } + Device.PendingGlobalsMtx.unlock(); + } +} + +void RTLsTy::RegisterRequires(int64_t flags) { + // TODO: add more elaborate check. + // Minimal check: only set requires flags if previous value + // is undefined. This ensures that only the first call to this + // function will set the requires flags. All subsequent calls + // will be checked for compatibility. + assert(flags != OMP_REQ_UNDEFINED && + "illegal undefined flag for requires directive!"); + if (RequiresFlags == OMP_REQ_UNDEFINED) { + RequiresFlags = flags; + return; + } + + // If multiple compilation units are present enforce + // consistency across all of them for require clauses: + // - reverse_offload + // - unified_address + // - unified_shared_memory + if ((RequiresFlags & OMP_REQ_REVERSE_OFFLOAD) != + (flags & OMP_REQ_REVERSE_OFFLOAD)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires reverse_offload' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_ADDRESS) != + (flags & OMP_REQ_UNIFIED_ADDRESS)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires unified_address' not used consistently!"); + } + if ((RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) != + (flags & OMP_REQ_UNIFIED_SHARED_MEMORY)) { + FATAL_MESSAGE0(1, + "'#pragma omp requires unified_shared_memory' not used consistently!"); + } + + // TODO: insert any other missing checks + + DP("New requires flags %ld compatible with existing %ld!\n", + flags, RequiresFlags); +} + +void RTLsTy::RegisterLib(__tgt_bin_desc *desc) { + // Attempt to load all plugins available in the system. + std::call_once(initFlag, &RTLsTy::LoadRTLs, this); + + RTLsMtx->lock(); + // Register the images with the RTLs that understand them, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. + for (auto &R : AllRTLs) { + if (!R.is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + + // If this RTL is not already in use, initialize it. + if (!R.isUsed) { + // Initialize the device information for the RTL we are about to use. 
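+        // Devices are appended to the global Devices vector; each records
+        // both its global ID (its index in that vector) and its RTL-local ID
+        // (its index within this plugin), which is the ID the plugin API is
+        // called with.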
+ DeviceTy device(&R); + size_t start = Devices.size(); + Devices.resize(start + R.NumberOfDevices, device); + for (int32_t device_id = 0; device_id < R.NumberOfDevices; + device_id++) { + // global device ID + Devices[start + device_id].DeviceID = start + device_id; + // RTL local device ID + Devices[start + device_id].RTLDeviceID = device_id; + } + + // Initialize the index of this RTL and save it in the used RTLs. + R.Idx = (UsedRTLs.empty()) + ? 0 + : UsedRTLs.back()->Idx + UsedRTLs.back()->NumberOfDevices; + assert((size_t) R.Idx == start && + "RTL index should equal the number of devices used so far."); + R.isUsed = true; + UsedRTLs.push_back(&R); + + DP("RTL " DPxMOD " has index %d!\n", DPxPTR(R.LibraryHandler), R.Idx); + } + + // Initialize (if necessary) translation table for this library. + TrlTblMtx->lock(); + if(!HostEntriesBeginToTransTable->count(desc->HostEntriesBegin)){ + TranslationTable &tt = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + tt.HostTable.EntriesBegin = desc->HostEntriesBegin; + tt.HostTable.EntriesEnd = desc->HostEntriesEnd; + } + + // Retrieve translation table for this library. + TranslationTable &TransTable = + (*HostEntriesBeginToTransTable)[desc->HostEntriesBegin]; + + DP("Registering image " DPxMOD " with RTL %s!\n", + DPxPTR(img->ImageStart), R.RTLName.c_str()); + RegisterImageIntoTranslationTable(TransTable, R, img); + TrlTblMtx->unlock(); + FoundRTL = &R; + + // Load ctors/dtors for static objects + RegisterGlobalCtorsDtorsForImage(desc, img, FoundRTL); + + // if an RTL was found we are done - proceed to register the next image + break; + } + + if (!FoundRTL) { + DP("No RTL found for image " DPxMOD "!\n", DPxPTR(img->ImageStart)); + } + } + RTLsMtx->unlock(); + + + DP("Done registering entries!\n"); +} + +void RTLsTy::UnregisterLib(__tgt_bin_desc *desc) { + DP("Unloading target library!\n"); + + RTLsMtx->lock(); + // Find which RTL understands each image, if any. + for (int32_t i = 0; i < desc->NumDeviceImages; ++i) { + // Obtain the image. + __tgt_device_image *img = &desc->DeviceImages[i]; + + RTLInfoTy *FoundRTL = NULL; + + // Scan the RTLs that have associated images until we find one that supports + // the current image. We only need to scan RTLs that are already being used. + for (auto *R : UsedRTLs) { + + assert(R->isUsed && "Expecting used RTLs."); + + if (!R->is_valid_binary(img)) { + DP("Image " DPxMOD " is NOT compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + continue; + } + + DP("Image " DPxMOD " is compatible with RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + FoundRTL = R; + + // Execute dtors for static objects if the device has been used, i.e. + // if its PendingCtors list has been emptied. 
+ for (int32_t i = 0; i < FoundRTL->NumberOfDevices; ++i) { + DeviceTy &Device = Devices[FoundRTL->Idx + i]; + Device.PendingGlobalsMtx.lock(); + if (Device.PendingCtorsDtors[desc].PendingCtors.empty()) { + for (auto &dtor : Device.PendingCtorsDtors[desc].PendingDtors) { + int rc = target(Device.DeviceID, dtor, 0, NULL, NULL, NULL, NULL, 1, + 1, true /*team*/); + if (rc != OFFLOAD_SUCCESS) { + DP("Running destructor " DPxMOD " failed.\n", DPxPTR(dtor)); + } + } + // Remove this library's entry from PendingCtorsDtors + Device.PendingCtorsDtors.erase(desc); + } + Device.PendingGlobalsMtx.unlock(); + } + + DP("Unregistered image " DPxMOD " from RTL " DPxMOD "!\n", + DPxPTR(img->ImageStart), DPxPTR(R->LibraryHandler)); + + break; + } + + // if no RTL was found proceed to unregister the next image + if (!FoundRTL){ + DP("No RTLs in use support the image " DPxMOD "!\n", + DPxPTR(img->ImageStart)); + } + } + RTLsMtx->unlock(); + DP("Done unregistering images!\n"); + + // Remove entries from HostPtrToTableMap + TblMapMtx->lock(); + for (__tgt_offload_entry *cur = desc->HostEntriesBegin; + cur < desc->HostEntriesEnd; ++cur) { + HostPtrToTableMap->erase(cur->addr); + } + + // Remove translation table for this descriptor. + auto tt = HostEntriesBeginToTransTable->find(desc->HostEntriesBegin); + if (tt != HostEntriesBeginToTransTable->end()) { + DP("Removing translation table for descriptor " DPxMOD "\n", + DPxPTR(desc->HostEntriesBegin)); + HostEntriesBeginToTransTable->erase(tt); + } else { + DP("Translation table for descriptor " DPxMOD " cannot be found, probably " + "it has been already removed.\n", DPxPTR(desc->HostEntriesBegin)); + } + + TblMapMtx->unlock(); + + // TODO: Remove RTL and the devices it manages if it's not used anymore? + // TODO: Write some RTL->unload_image(...) function? + + DP("Done unregistering library!\n"); +} diff --git a/openmp/libomptarget/src/rtl.h b/openmp/libomptarget/src/rtl.h index 86ecd6724a8df..62f6f466e6cee 100644 --- a/openmp/libomptarget/src/rtl.h +++ b/openmp/libomptarget/src/rtl.h @@ -1,185 +1,192 @@ -//===------------ rtl.h - Target independent OpenMP target RTL ------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Declarations for handling RTL plugins. -// -//===----------------------------------------------------------------------===// - -#ifndef _OMPTARGET_RTL_H -#define _OMPTARGET_RTL_H - -#include "omptarget.h" -#include -#include -#include -#include -#include - -// Forward declarations. 
-struct DeviceTy; -struct __tgt_bin_desc; - -struct RTLInfoTy { - typedef int32_t(is_valid_binary_ty)(void *); - typedef int32_t(number_of_devices_ty)(); - typedef int32_t(init_device_ty)(int32_t); - typedef __tgt_target_table *(load_binary_ty)(int32_t, void *); - typedef void *(data_alloc_ty)(int32_t, int64_t, void *); - typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t); - typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t, - __tgt_async_info *); - typedef int32_t(data_delete_ty)(int32_t, void *); - typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t); - typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, __tgt_async_info *); - typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *, - int32_t, int32_t, int32_t, uint64_t); - typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **, - ptrdiff_t *, int32_t, int32_t, - int32_t, uint64_t, - __tgt_async_info *); - typedef int64_t(init_requires_ty)(int64_t); - typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *); - - int32_t Idx = -1; // RTL index, index is the number of devices - // of other RTLs that were registered before, - // i.e. the OpenMP index of the first device - // to be registered with this RTL. - int32_t NumberOfDevices = -1; // Number of devices this RTL deals with. - - void *LibraryHandler = nullptr; - -#ifdef OMPTARGET_DEBUG - std::string RTLName; -#endif - - // Functions implemented in the RTL. - is_valid_binary_ty *is_valid_binary = nullptr; - number_of_devices_ty *number_of_devices = nullptr; - init_device_ty *init_device = nullptr; - load_binary_ty *load_binary = nullptr; - data_alloc_ty *data_alloc = nullptr; - data_submit_ty *data_submit = nullptr; - data_submit_async_ty *data_submit_async = nullptr; - data_retrieve_ty *data_retrieve = nullptr; - data_retrieve_async_ty *data_retrieve_async = nullptr; - data_delete_ty *data_delete = nullptr; - run_region_ty *run_region = nullptr; - run_region_async_ty *run_region_async = nullptr; - run_team_region_ty *run_team_region = nullptr; - run_team_region_async_ty *run_team_region_async = nullptr; - init_requires_ty *init_requires = nullptr; - synchronize_ty *synchronize = nullptr; - - // Are there images associated with this RTL. - bool isUsed = false; - - // Mutex for thread-safety when calling RTL interface functions. - // It is easier to enforce thread-safety at the libomptarget level, - // so that developers of new RTLs do not have to worry about it. - std::mutex Mtx; - - // The existence of the mutex above makes RTLInfoTy non-copyable. - // We need to provide a copy constructor explicitly. 
- RTLInfoTy() = default; - - RTLInfoTy(const RTLInfoTy &r) { - Idx = r.Idx; - NumberOfDevices = r.NumberOfDevices; - LibraryHandler = r.LibraryHandler; -#ifdef OMPTARGET_DEBUG - RTLName = r.RTLName; -#endif - is_valid_binary = r.is_valid_binary; - number_of_devices = r.number_of_devices; - init_device = r.init_device; - load_binary = r.load_binary; - data_alloc = r.data_alloc; - data_submit = r.data_submit; - data_submit_async = r.data_submit_async; - data_retrieve = r.data_retrieve; - data_retrieve_async = r.data_retrieve_async; - data_delete = r.data_delete; - run_region = r.run_region; - run_region_async = r.run_region_async; - run_team_region = r.run_team_region; - run_team_region_async = r.run_team_region_async; - init_requires = r.init_requires; - isUsed = r.isUsed; - synchronize = r.synchronize; - } -}; - -/// RTLs identified in the system. -class RTLsTy { -private: - // Mutex-like object to guarantee thread-safety and unique initialization - // (i.e. the library attempts to load the RTLs (plugins) only once). - std::once_flag initFlag; - void LoadRTLs(); // not thread-safe - -public: - // List of the detected runtime libraries. - std::list AllRTLs; - - // Array of pointers to the detected runtime libraries that have compatible - // binaries. - std::vector UsedRTLs; - - int64_t RequiresFlags = OMP_REQ_UNDEFINED; - - explicit RTLsTy() = default; - - // Register the clauses of the requires directive. - void RegisterRequires(int64_t flags); - - // Register a shared library with all (compatible) RTLs. - void RegisterLib(__tgt_bin_desc *desc); - - // Unregister a shared library from all RTLs. - void UnregisterLib(__tgt_bin_desc *desc); -}; -extern RTLsTy *RTLs; -extern std::mutex *RTLsMtx; - - -/// Map between the host entry begin and the translation table. Each -/// registered library gets one TranslationTable. Use the map from -/// __tgt_offload_entry so that we may quickly determine whether we -/// are trying to (re)register an existing lib or really have a new one. -struct TranslationTable { - __tgt_target_table HostTable; - - // Image assigned to a given device. - std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. - - // Table of entry points or NULL if it was not already computed. - std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. -}; -typedef std::map<__tgt_offload_entry *, TranslationTable> - HostEntriesBeginToTransTableTy; -extern HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; -extern std::mutex *TrlTblMtx; - -/// Map between the host ptr and a table index -struct TableMap { - TranslationTable *Table = nullptr; // table associated with the host ptr. - uint32_t Index = 0; // index in which the host ptr translated entry is found. - TableMap() = default; - TableMap(TranslationTable *table, uint32_t index) - : Table(table), Index(index) {} -}; -typedef std::map HostPtrToTableMapTy; -extern HostPtrToTableMapTy *HostPtrToTableMap; -extern std::mutex *TblMapMtx; - -#endif +//===------------ rtl.h - Target independent OpenMP target RTL ------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Declarations for handling RTL plugins. 
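+// Each plugin is loaded as a shared library; RTLInfoTy below caches one
+// function pointer per plugin entry point.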
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _OMPTARGET_RTL_H
+#define _OMPTARGET_RTL_H
+
+#include "omptarget.h"
+#include <list>
+#include <map>
+#include <mutex>
+#include <string>
+#include <vector>
+
+// Forward declarations.
+struct DeviceTy;
+struct __tgt_bin_desc;
+
+struct RTLInfoTy {
+  typedef int32_t(is_valid_binary_ty)(void *);
+  typedef int32_t(number_of_devices_ty)();
+  typedef int32_t(init_device_ty)(int32_t);
+  typedef __tgt_target_table *(load_binary_ty)(int32_t, void *);
+  typedef void *(data_alloc_ty)(int32_t, int64_t, void *);
+  typedef int32_t(data_submit_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_submit_async_ty)(int32_t, void *, void *, int64_t,
+                                        __tgt_async_info *);
+  typedef int32_t(data_retrieve_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_retrieve_async_ty)(int32_t, void *, void *, int64_t,
+                                          __tgt_async_info *);
+  typedef int32_t(data_transfer_ty)(int32_t, void *, void *, int64_t);
+  typedef int32_t(data_transfer_async_ty)(int32_t, void *, void *, int64_t,
+                                          __tgt_async_info *);
+  typedef int32_t(data_delete_ty)(int32_t, void *);
+  typedef int32_t(run_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                 int32_t);
+  typedef int32_t(run_region_async_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                       int32_t, __tgt_async_info *);
+  typedef int32_t(run_team_region_ty)(int32_t, void *, void **, ptrdiff_t *,
+                                      int32_t, int32_t, int32_t, uint64_t);
+  typedef int32_t(run_team_region_async_ty)(int32_t, void *, void **,
+                                            ptrdiff_t *, int32_t, int32_t,
+                                            int32_t, uint64_t,
+                                            __tgt_async_info *);
+  typedef int64_t(init_requires_ty)(int64_t);
+  typedef int64_t(synchronize_ty)(int64_t, __tgt_async_info *);
+
+  int32_t Idx = -1;             // RTL index, index is the number of devices
+                                // of other RTLs that were registered before,
+                                // i.e. the OpenMP index of the first device
+                                // to be registered with this RTL.
+  int32_t NumberOfDevices = -1; // Number of devices this RTL deals with.
+
+  void *LibraryHandler = nullptr;
+
+#ifdef OMPTARGET_DEBUG
+  std::string RTLName;
+#endif
+
+  // Functions implemented in the RTL.
+  is_valid_binary_ty *is_valid_binary = nullptr;
+  number_of_devices_ty *number_of_devices = nullptr;
+  init_device_ty *init_device = nullptr;
+  load_binary_ty *load_binary = nullptr;
+  data_alloc_ty *data_alloc = nullptr;
+  data_submit_ty *data_submit = nullptr;
+  data_submit_async_ty *data_submit_async = nullptr;
+  data_retrieve_ty *data_retrieve = nullptr;
+  data_retrieve_async_ty *data_retrieve_async = nullptr;
+  data_transfer_ty *data_transfer = nullptr;
+  data_transfer_async_ty *data_transfer_async = nullptr;
+  data_delete_ty *data_delete = nullptr;
+  run_region_ty *run_region = nullptr;
+  run_region_async_ty *run_region_async = nullptr;
+  run_team_region_ty *run_team_region = nullptr;
+  run_team_region_async_ty *run_team_region_async = nullptr;
+  init_requires_ty *init_requires = nullptr;
+  synchronize_ty *synchronize = nullptr;
+
+  // Are there images associated with this RTL.
+  bool isUsed = false;
+
+  // Mutex for thread-safety when calling RTL interface functions.
+  // It is easier to enforce thread-safety at the libomptarget level,
+  // so that developers of new RTLs do not have to worry about it.
+  std::mutex Mtx;
+
+  // The existence of the mutex above makes RTLInfoTy non-copyable.
+  // We need to provide a copy constructor explicitly.
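+  // Note that the copy below leaves Mtx default-constructed; only the plain
+  // data members and the function pointers are copied.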
+ RTLInfoTy() = default; + + RTLInfoTy(const RTLInfoTy &r) { + Idx = r.Idx; + NumberOfDevices = r.NumberOfDevices; + LibraryHandler = r.LibraryHandler; +#ifdef OMPTARGET_DEBUG + RTLName = r.RTLName; +#endif + is_valid_binary = r.is_valid_binary; + number_of_devices = r.number_of_devices; + init_device = r.init_device; + load_binary = r.load_binary; + data_alloc = r.data_alloc; + data_submit = r.data_submit; + data_submit_async = r.data_submit_async; + data_retrieve = r.data_retrieve; + data_retrieve_async = r.data_retrieve_async; + data_transfer = r.data_transfer; + data_transfer_async = r.data_transfer_async; + data_delete = r.data_delete; + run_region = r.run_region; + run_region_async = r.run_region_async; + run_team_region = r.run_team_region; + run_team_region_async = r.run_team_region_async; + init_requires = r.init_requires; + isUsed = r.isUsed; + synchronize = r.synchronize; + } +}; + +/// RTLs identified in the system. +class RTLsTy { +private: + // Mutex-like object to guarantee thread-safety and unique initialization + // (i.e. the library attempts to load the RTLs (plugins) only once). + std::once_flag initFlag; + void LoadRTLs(); // not thread-safe + +public: + // List of the detected runtime libraries. + std::list AllRTLs; + + // Array of pointers to the detected runtime libraries that have compatible + // binaries. + std::vector UsedRTLs; + + int64_t RequiresFlags = OMP_REQ_UNDEFINED; + + explicit RTLsTy() = default; + + // Register the clauses of the requires directive. + void RegisterRequires(int64_t flags); + + // Register a shared library with all (compatible) RTLs. + void RegisterLib(__tgt_bin_desc *desc); + + // Unregister a shared library from all RTLs. + void UnregisterLib(__tgt_bin_desc *desc); +}; +extern RTLsTy *RTLs; +extern std::mutex *RTLsMtx; + + +/// Map between the host entry begin and the translation table. Each +/// registered library gets one TranslationTable. Use the map from +/// __tgt_offload_entry so that we may quickly determine whether we +/// are trying to (re)register an existing lib or really have a new one. +struct TranslationTable { + __tgt_target_table HostTable; + + // Image assigned to a given device. + std::vector<__tgt_device_image *> TargetsImages; // One image per device ID. + + // Table of entry points or NULL if it was not already computed. + std::vector<__tgt_target_table *> TargetsTable; // One table per device ID. +}; +typedef std::map<__tgt_offload_entry *, TranslationTable> + HostEntriesBeginToTransTableTy; +extern HostEntriesBeginToTransTableTy *HostEntriesBeginToTransTable; +extern std::mutex *TrlTblMtx; + +/// Map between the host ptr and a table index +struct TableMap { + TranslationTable *Table = nullptr; // table associated with the host ptr. + uint32_t Index = 0; // index in which the host ptr translated entry is found. + TableMap() = default; + TableMap(TranslationTable *table, uint32_t index) + : Table(table), Index(index) {} +}; +typedef std::map HostPtrToTableMapTy; +extern HostPtrToTableMapTy *HostPtrToTableMap; +extern std::mutex *TblMapMtx; + +#endif diff --git a/openmp/libomptarget/test/CMakeLists.txt b/openmp/libomptarget/test/CMakeLists.txt index aa3fffcfe60ef..de6a6308740f3 100644 --- a/openmp/libomptarget/test/CMakeLists.txt +++ b/openmp/libomptarget/test/CMakeLists.txt @@ -1,19 +1,19 @@ -# CMakeLists.txt file for unit testing OpenMP offloading runtime library. 
-if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR - OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0) - libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.") - libomptarget_warning_say("The check-libomptarget target will not be available!") - return() -endif() - -if(LIBOMPTARGET_ENABLE_DEBUG) - set(LIBOMPTARGET_DEBUG True) -else() - set(LIBOMPTARGET_DEBUG False) -endif() - -add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}) - -# Configure the lit.site.cfg.in file -set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") -configure_file(lit.site.cfg.in lit.site.cfg @ONLY) +# CMakeLists.txt file for unit testing OpenMP offloading runtime library. +if(NOT OPENMP_TEST_COMPILER_ID STREQUAL "Clang" OR + OPENMP_TEST_COMPILER_VERSION VERSION_LESS 6.0.0) + libomptarget_say("Can only test with Clang compiler in version 6.0.0 or later.") + libomptarget_warning_say("The check-libomptarget target will not be available!") + return() +endif() + +if(LIBOMPTARGET_ENABLE_DEBUG) + set(LIBOMPTARGET_DEBUG True) +else() + set(LIBOMPTARGET_DEBUG False) +endif() + +add_openmp_testsuite(check-libomptarget "Running libomptarget tests" ${CMAKE_CURRENT_BINARY_DIR} DEPENDS omptarget omp ${LIBOMPTARGET_TESTED_PLUGINS}) + +# Configure the lit.site.cfg.in file +set(AUTO_GEN_COMMENT "## Autogenerated by libomptarget configuration.\n# Do not edit!") +configure_file(lit.site.cfg.in lit.site.cfg @ONLY) diff --git a/openmp/libomptarget/test/api/omp_get_num_devices.c b/openmp/libomptarget/test/api/omp_get_num_devices.c index d0e84db6b1081..b121847151226 100644 --- a/openmp/libomptarget/test/api/omp_get_num_devices.c +++ b/openmp/libomptarget/test/api/omp_get_num_devices.c @@ -1,36 +1,36 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -int test_omp_get_num_devices() -{ - /* checks that omp_get_num_devices() > 0 */ - int num_devices = omp_get_num_devices(); - printf("num_devices = %d\n", num_devices); - - #pragma omp target - {} - - return (num_devices > 0); -} - -int main() -{ - int i; - int failed=0; - - if (!test_omp_get_num_devices()) { - failed++; - } - if (failed) - printf("FAIL\n"); - else - printf("PASS\n"); - return failed; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int test_omp_get_num_devices() +{ + /* checks that omp_get_num_devices() > 0 */ + int num_devices = omp_get_num_devices(); + printf("num_devices = %d\n", num_devices); + + #pragma omp target + {} + + return (num_devices > 0); +} + +int main() +{ + int i; + int failed=0; + + if (!test_omp_get_num_devices()) { + failed++; + } + if (failed) + printf("FAIL\n"); + else + printf("PASS\n"); + return failed; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c b/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c index 85dcb73f11490..fb4d7036c417c 100644 --- 
a/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c +++ b/openmp/libomptarget/test/api/omp_get_num_devices_with_empty_target.c @@ -1,30 +1,30 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -static int test_omp_get_num_devices_with_empty_target() { - /* checks that omp_get_num_devices() > 0 */ - return omp_get_num_devices() > 0; -} - -int main() { - int failed = 0; - - if (!test_omp_get_num_devices_with_empty_target()) { - ++failed; - } - - if (failed) { - printf("FAIL\n"); - } else { - printf("PASS\n"); - } - - return failed; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +static int test_omp_get_num_devices_with_empty_target() { + /* checks that omp_get_num_devices() > 0 */ + return omp_get_num_devices() > 0; +} + +int main() { + int failed = 0; + + if (!test_omp_get_num_devices_with_empty_target()) { + ++failed; + } + + if (failed) { + printf("FAIL\n"); + } else { + printf("PASS\n"); + } + + return failed; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/env/omp_target_debug.c b/openmp/libomptarget/test/env/omp_target_debug.c index ce84c9842f64f..4ad503f258e2c 100644 --- a/openmp/libomptarget/test/env/omp_target_debug.c +++ b/openmp/libomptarget/test/env/omp_target_debug.c @@ -1,20 +1,20 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG -// REQUIRES: libomptarget-debug - -int main(void) { -#pragma omp target - {} - return 0; -} - -// DEBUG: Libomptarget -// NDEBUG-NOT: Libomptarget -// NDEBUG-NOT: Target - +// RUN: 
%libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=NDEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=0 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=NDEBUG +// REQUIRES: libomptarget-debug + +int main(void) { +#pragma omp target + {} + return 0; +} + +// DEBUG: Libomptarget +// NDEBUG-NOT: Libomptarget +// NDEBUG-NOT: Target + diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index 43116055c82b0..d6ba85080d963 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -1,142 +1,142 @@ -# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: -# Configuration file for the 'lit' test runner. - -import os -import lit.formats - -# Tell pylint that we know config and lit_config exist somewhere. -if 'PYLINT_IMPORT' in os.environ: - config = object() - lit_config = object() - -def append_dynamic_library_path(name, value, sep): - if name in config.environment: - config.environment[name] = value + sep + config.environment[name] - else: - config.environment[name] = value - -# name: The name of this test suite. -config.name = 'libomptarget' - -# suffixes: A list of file extensions to treat as test files. -config.suffixes = ['.c', '.cpp', '.cc'] - -# test_source_root: The root path where tests are located. 
-config.test_source_root = os.path.dirname(__file__) - -# test_exec_root: The root object directory where output is placed -config.test_exec_root = config.libomptarget_obj_root - -# test format -config.test_format = lit.formats.ShTest() - -# compiler flags -config.test_flags = " -I " + config.test_source_root + \ - " -I " + config.omp_header_directory + \ - " -L " + config.library_dir; - -if config.omp_host_rtl_directory: - config.test_flags = config.test_flags + " -L " + \ - config.omp_host_rtl_directory - -config.test_flags = config.test_flags + " " + config.test_extra_flags - -# Allow REQUIRES / UNSUPPORTED / XFAIL to work -config.target_triple = [ ] -for feature in config.test_compiler_features: - config.available_features.add(feature) - -if config.libomptarget_debug: - config.available_features.add('libomptarget-debug') - -# Setup environment to find dynamic library at runtime -if config.operating_system == 'Windows': - append_dynamic_library_path('PATH', config.library_dir, ";") - append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";") -elif config.operating_system == 'Darwin': - append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":") - append_dynamic_library_path('DYLD_LIBRARY_PATH', \ - config.omp_host_rtl_directory, ";") - config.test_flags += " -Wl,-rpath," + config.library_dir - config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory -else: # Unices - append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") - append_dynamic_library_path('LD_LIBRARY_PATH', \ - config.omp_host_rtl_directory, ":") - -# substitutions -# - for targets that exist in the system create the actual command. -# - for valid targets that do not exist in the system, return false, so that the -# same test can be used for different targets. - -# Scan all the valid targets. -for libomptarget_target in config.libomptarget_all_targets: - # Is this target in the current system? If so create a compile, run and test - # command. Otherwise create command that return false. 
- if libomptarget_target in config.libomptarget_system_targets: - config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ - libomptarget_target, \ - "%libomptarget-compilexx-and-run-" + libomptarget_target + \ - " | " + config.libomptarget_filecheck + " %s")) - config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ - libomptarget_target, \ - "%libomptarget-compile-and-run-" + libomptarget_target + \ - " | " + config.libomptarget_filecheck + " %s")) - config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ - libomptarget_target, \ - "%libomptarget-compilexx-" + libomptarget_target + " && " + \ - "%libomptarget-run-" + libomptarget_target)) - config.substitutions.append(("%libomptarget-compile-and-run-" + \ - libomptarget_target, \ - "%libomptarget-compile-" + libomptarget_target + " && " + \ - "%libomptarget-run-" + libomptarget_target)) - config.substitutions.append(("%libomptarget-compilexx-" + \ - libomptarget_target, \ - "%clangxx-" + libomptarget_target + " %s -o %t-" + \ - libomptarget_target)) - config.substitutions.append(("%libomptarget-compile-" + \ - libomptarget_target, \ - "%clang-" + libomptarget_target + " %s -o %t-" + \ - libomptarget_target)) - config.substitutions.append(("%libomptarget-run-" + \ - libomptarget_target, \ - "%t-" + libomptarget_target)) - config.substitutions.append(("%clangxx-" + libomptarget_target, \ - "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) - config.substitutions.append(("%clang-" + libomptarget_target, \ - "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) - config.substitutions.append(("%fcheck-" + libomptarget_target, \ - config.libomptarget_filecheck + " %s")) - else: - config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compile-and-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compilexx-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-compile-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%libomptarget-run-" + \ - libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%clang-" + libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%clangxx-" + libomptarget_target, \ - "echo ignored-command")) - config.substitutions.append(("%fcheck-" + libomptarget_target, \ - "echo ignored-command")) - -config.substitutions.append(("%clangxx", config.test_cxx_compiler)) -config.substitutions.append(("%clang", config.test_c_compiler)) -config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) -config.substitutions.append(("%flags", config.test_flags)) +# -*- Python -*- vim: set ft=python ts=4 sw=4 expandtab tw=79: +# Configuration file for the 'lit' test runner. + +import os +import lit.formats + +# Tell pylint that we know config and lit_config exist somewhere. 
+if 'PYLINT_IMPORT' in os.environ: + config = object() + lit_config = object() + +def append_dynamic_library_path(name, value, sep): + if name in config.environment: + config.environment[name] = value + sep + config.environment[name] + else: + config.environment[name] = value + +# name: The name of this test suite. +config.name = 'libomptarget' + +# suffixes: A list of file extensions to treat as test files. +config.suffixes = ['.c', '.cpp', '.cc'] + +# test_source_root: The root path where tests are located. +config.test_source_root = os.path.dirname(__file__) + +# test_exec_root: The root object directory where output is placed +config.test_exec_root = config.libomptarget_obj_root + +# test format +config.test_format = lit.formats.ShTest() + +# compiler flags +config.test_flags = " -I " + config.test_source_root + \ + " -I " + config.omp_header_directory + \ + " -L " + config.library_dir; + +if config.omp_host_rtl_directory: + config.test_flags = config.test_flags + " -L " + \ + config.omp_host_rtl_directory + +config.test_flags = config.test_flags + " " + config.test_extra_flags + +# Allow REQUIRES / UNSUPPORTED / XFAIL to work +config.target_triple = [ ] +for feature in config.test_compiler_features: + config.available_features.add(feature) + +if config.libomptarget_debug: + config.available_features.add('libomptarget-debug') + +# Setup environment to find dynamic library at runtime +if config.operating_system == 'Windows': + append_dynamic_library_path('PATH', config.library_dir, ";") + append_dynamic_library_path('PATH', config.omp_host_rtl_directory, ";") +elif config.operating_system == 'Darwin': + append_dynamic_library_path('DYLD_LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('DYLD_LIBRARY_PATH', \ + config.omp_host_rtl_directory, ";") + config.test_flags += " -Wl,-rpath," + config.library_dir + config.test_flags += " -Wl,-rpath," + config.omp_host_rtl_directory +else: # Unices + append_dynamic_library_path('LD_LIBRARY_PATH', config.library_dir, ":") + append_dynamic_library_path('LD_LIBRARY_PATH', \ + config.omp_host_rtl_directory, ":") + +# substitutions +# - for targets that exist in the system create the actual command. +# - for valid targets that do not exist in the system, return false, so that the +# same test can be used for different targets. + +# Scan all the valid targets. +for libomptarget_target in config.libomptarget_all_targets: + # Is this target in the current system? If so create a compile, run and test + # command. Otherwise create command that return false. 
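+    # e.g. %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu chains
+    # %libomptarget-compile-and-run-... into FileCheck against %s, which in
+    # turn expands to a %clang-... compile of %s followed by running the
+    # produced binary %t-....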
+ if libomptarget_target in config.libomptarget_system_targets: + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "%libomptarget-compile-and-run-" + libomptarget_target + \ + " | " + config.libomptarget_filecheck + " %s")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "%libomptarget-compilexx-" + libomptarget_target + " && " + \ + "%libomptarget-run-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "%libomptarget-compile-" + libomptarget_target + " && " + \ + "%libomptarget-run-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compilexx-" + \ + libomptarget_target, \ + "%clangxx-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target)) + config.substitutions.append(("%libomptarget-compile-" + \ + libomptarget_target, \ + "%clang-" + libomptarget_target + " %s -o %t-" + \ + libomptarget_target)) + config.substitutions.append(("%libomptarget-run-" + \ + libomptarget_target, \ + "%t-" + libomptarget_target)) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "%clangxx %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "%clang %openmp_flags %flags -fopenmp-targets=" + libomptarget_target)) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + config.libomptarget_filecheck + " %s")) + else: + config.substitutions.append(("%libomptarget-compile-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-run-and-check-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-and-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compilexx-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-compile-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%libomptarget-run-" + \ + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clang-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%clangxx-" + libomptarget_target, \ + "echo ignored-command")) + config.substitutions.append(("%fcheck-" + libomptarget_target, \ + "echo ignored-command")) + +config.substitutions.append(("%clangxx", config.test_cxx_compiler)) +config.substitutions.append(("%clang", config.test_c_compiler)) +config.substitutions.append(("%openmp_flags", config.test_openmp_flags)) +config.substitutions.append(("%flags", config.test_flags)) diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in index 26ef4920d91ee..c8aff49aa6a90 100644 --- a/openmp/libomptarget/test/lit.site.cfg.in +++ b/openmp/libomptarget/test/lit.site.cfg.in @@ -1,19 +1,19 @@ -@AUTO_GEN_COMMENT@ - -config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" -config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" -config.test_compiler_features = 
@OPENMP_TEST_COMPILER_FEATURES@ -config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" -config.test_extra_flags = "@OPENMP_TEST_FLAGS@" -config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" -config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" -config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" -config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" -config.operating_system = "@CMAKE_SYSTEM_NAME@" -config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() -config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() -config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" -config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ - -# Let the main config do the real work. -lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") +@AUTO_GEN_COMMENT@ + +config.test_c_compiler = "@OPENMP_TEST_C_COMPILER@" +config.test_cxx_compiler = "@OPENMP_TEST_CXX_COMPILER@" +config.test_compiler_features = @OPENMP_TEST_COMPILER_FEATURES@ +config.test_openmp_flags = "@OPENMP_TEST_OPENMP_FLAGS@" +config.test_extra_flags = "@OPENMP_TEST_FLAGS@" +config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@" +config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" +config.omp_header_directory = "@LIBOMPTARGET_OPENMP_HEADER_FOLDER@" +config.omp_host_rtl_directory = "@LIBOMPTARGET_OPENMP_HOST_RTL_FOLDER@" +config.operating_system = "@CMAKE_SYSTEM_NAME@" +config.libomptarget_all_targets = "@LIBOMPTARGET_ALL_TARGETS@".split() +config.libomptarget_system_targets = "@LIBOMPTARGET_SYSTEM_TARGETS@".split() +config.libomptarget_filecheck = "@OPENMP_FILECHECK_EXECUTABLE@" +config.libomptarget_debug = @LIBOMPTARGET_DEBUG@ + +# Let the main config do the real work. +lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg") diff --git a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp index 275b6c3c57025..9e4447ce2d4b8 100644 --- a/openmp/libomptarget/test/mapping/declare_mapper_api.cpp +++ b/openmp/libomptarget/test/mapping/declare_mapper_api.cpp @@ -1,47 +1,47 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include -#include - -// Data structure definitions copied from OpenMP RTL. -struct MapComponentInfoTy { - void *Base; - void *Begin; - int64_t Size; - int64_t Type; - MapComponentInfoTy() = default; - MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) - : Base(Base), Begin(Begin), Size(Size), Type(Type) {} -}; - -struct MapperComponentsTy { - std::vector Components; -}; - -// OpenMP RTL interfaces -#ifdef __cplusplus -extern "C" { -#endif -int64_t __tgt_mapper_num_components(void *rt_mapper_handle); -void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, - void *begin, int64_t size, int64_t type); -#ifdef __cplusplus -} -#endif - -int main(int argc, char *argv[]) { - MapperComponentsTy MC; - void *base, *begin; - int64_t size, type; - // Push 2 elements into MC. 
- __tgt_push_mapper_component((void *)&MC, base, begin, size, type); - __tgt_push_mapper_component((void *)&MC, base, begin, size, type); - int64_t num = __tgt_mapper_num_components((void *)&MC); - // CHECK: num=2 - printf("num=%lld\n", num); - return 0; -} +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include +#include + +// Data structure definitions copied from OpenMP RTL. +struct MapComponentInfoTy { + void *Base; + void *Begin; + int64_t Size; + int64_t Type; + MapComponentInfoTy() = default; + MapComponentInfoTy(void *Base, void *Begin, int64_t Size, int64_t Type) + : Base(Base), Begin(Begin), Size(Size), Type(Type) {} +}; + +struct MapperComponentsTy { + std::vector Components; +}; + +// OpenMP RTL interfaces +#ifdef __cplusplus +extern "C" { +#endif +int64_t __tgt_mapper_num_components(void *rt_mapper_handle); +void __tgt_push_mapper_component(void *rt_mapper_handle, void *base, + void *begin, int64_t size, int64_t type); +#ifdef __cplusplus +} +#endif + +int main(int argc, char *argv[]) { + MapperComponentsTy MC; + void *base, *begin; + int64_t size, type; + // Push 2 elements into MC. + __tgt_push_mapper_component((void *)&MC, base, begin, size, type); + __tgt_push_mapper_component((void *)&MC, base, begin, size, type); + int64_t num = __tgt_mapper_num_components((void *)&MC); + // CHECK: num=2 + printf("num=%lld\n", num); + return 0; +} diff --git a/openmp/libomptarget/test/mapping/delete_inf_refcount.c b/openmp/libomptarget/test/mapping/delete_inf_refcount.c index b4106be04ab73..781ece71eb987 100644 --- a/openmp/libomptarget/test/mapping/delete_inf_refcount.c +++ b/openmp/libomptarget/test/mapping/delete_inf_refcount.c @@ -1,32 +1,32 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -#pragma omp declare target -int isHost; -#pragma omp end declare target - -int main(void) { - isHost = -1; - -#pragma omp target enter data map(to: isHost) - -#pragma omp target - { isHost = omp_is_initial_device(); } -#pragma omp target update from(isHost) - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - -#pragma omp target exit data map(delete: isHost) - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? "host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +#pragma omp declare target +int isHost; +#pragma omp end declare target + +int main(void) { + isHost = -1; + +#pragma omp target enter data map(to: isHost) + +#pragma omp target + { isHost = omp_is_initial_device(); } +#pragma omp target update from(isHost) + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + +#pragma omp target exit data map(delete: isHost) + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/mapping/pr38704.c b/openmp/libomptarget/test/mapping/pr38704.c index 3e7135e284114..fcb4afee9530e 100644 --- a/openmp/libomptarget/test/mapping/pr38704.c +++ b/openmp/libomptarget/test/mapping/pr38704.c @@ -1,47 +1,47 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// Clang 6.0 doesn't use the new map interface, undefined behavior when -// the compiler emits "old" interface code for structures. -// UNSUPPORTED: clang-6 - -#include -#include - -typedef struct { - int *ptr1; - int *ptr2; -} StructWithPtrs; - -int main(int argc, char *argv[]) { - StructWithPtrs s, s2; - s.ptr1 = malloc(sizeof(int)); - s.ptr2 = malloc(2 * sizeof(int)); - s2.ptr1 = malloc(sizeof(int)); - s2.ptr2 = malloc(2 * sizeof(int)); - -#pragma omp target enter data map(to: s2.ptr2[0:1]) -#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2]) - { - s.ptr1[0] = 1; - s.ptr2[0] = 2; - s.ptr2[1] = 3; - } -#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1]) - - // CHECK: s.ptr1[0] = 1 - // CHECK: s.ptr2[0] = 2 - // CHECK: s.ptr2[1] = 3 - printf("s.ptr1[0] = %d\n", s.ptr1[0]); - printf("s.ptr2[0] = %d\n", s.ptr2[0]); - printf("s.ptr2[1] = %d\n", s.ptr2[1]); - - free(s.ptr1); - free(s.ptr2); - free(s2.ptr1); - free(s2.ptr2); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// Clang 6.0 doesn't use the new map interface, undefined behavior when +// the compiler emits "old" interface code for structures. 
+// UNSUPPORTED: clang-6 + +#include +#include + +typedef struct { + int *ptr1; + int *ptr2; +} StructWithPtrs; + +int main(int argc, char *argv[]) { + StructWithPtrs s, s2; + s.ptr1 = malloc(sizeof(int)); + s.ptr2 = malloc(2 * sizeof(int)); + s2.ptr1 = malloc(sizeof(int)); + s2.ptr2 = malloc(2 * sizeof(int)); + +#pragma omp target enter data map(to: s2.ptr2[0:1]) +#pragma omp target map(s.ptr1[0:1], s.ptr2[0:2]) + { + s.ptr1[0] = 1; + s.ptr2[0] = 2; + s.ptr2[1] = 3; + } +#pragma omp target exit data map(from: s2.ptr1[0:1], s2.ptr2[0:1]) + + // CHECK: s.ptr1[0] = 1 + // CHECK: s.ptr2[0] = 2 + // CHECK: s.ptr2[1] = 3 + printf("s.ptr1[0] = %d\n", s.ptr1[0]); + printf("s.ptr2[0] = %d\n", s.ptr2[0]); + printf("s.ptr2[1] = %d\n", s.ptr2[1]); + + free(s.ptr1); + free(s.ptr2); + free(s2.ptr1); + free(s2.ptr2); + + return 0; +} diff --git a/openmp/libomptarget/test/offloading/dynamic_module.c b/openmp/libomptarget/test/offloading/dynamic_module.c index 7f062b6d752c0..ae58ec2c9d07c 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module.c +++ b/openmp/libomptarget/test/offloading/dynamic_module.c @@ -1,17 +1,17 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-aarch64-unknown-linux-gnu %t.so && %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu - -#ifdef SHARED -void foo() {} -#else -#include -int main() { -#pragma omp target - ; - // CHECK: DONE. - printf("%s\n", "DONE."); - return 0; -} -#endif +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-aarch64-unknown-linux-gnu %t.so && %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-powerpc64le-ibm-linux-gnu %t.so && %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %libomptarget-compile-x86_64-pc-linux-gnu %t.so && %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu + +#ifdef SHARED +void foo() {} +#else +#include +int main() { +#pragma omp target + ; + // CHECK: DONE. 
+ printf("%s\n", "DONE."); + return 0; +} +#endif diff --git a/openmp/libomptarget/test/offloading/dynamic_module_load.c b/openmp/libomptarget/test/offloading/dynamic_module_load.c index fe917e4fe1cfb..8c61464929963 100644 --- a/openmp/libomptarget/test/offloading/dynamic_module_load.c +++ b/openmp/libomptarget/test/offloading/dynamic_module_load.c @@ -1,34 +1,34 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-aarch64-unknown-linux-gnu -ldl && %libomptarget-run-aarch64-unknown-linux-gnu %t.so 2>&1 | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu - -#ifdef SHARED -#include -int foo() { -#pragma omp target - ; - printf("%s\n", "DONE."); - return 0; -} -#else -#include -#include -int main(int argc, char **argv) { - void *Handle = dlopen(argv[1], RTLD_NOW); - int (*Foo)(void); - - if (Handle == NULL) { - printf("dlopen() failed: %s\n", dlerror()); - return 1; - } - Foo = (int (*)(void)) dlsym(Handle, "foo"); - if (Handle == NULL) { - printf("dlsym() failed: %s\n", dlerror()); - return 1; - } - // CHECK: DONE. - // CHECK-NOT: {{abort|fault}} - return Foo(); -} -#endif +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-aarch64-unknown-linux-gnu -ldl && %libomptarget-run-aarch64-unknown-linux-gnu %t.so 2>&1 | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-powerpc64le-ibm-linux-gnu -ldl && %libomptarget-run-powerpc64le-ibm-linux-gnu %t.so 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu -DSHARED -shared -o %t.so && %clang %flags %s -o %t-x86_64-pc-linux-gnu -ldl && %libomptarget-run-x86_64-pc-linux-gnu %t.so 2>&1 | %fcheck-x86_64-pc-linux-gnu + +#ifdef SHARED +#include +int foo() { +#pragma omp target + ; + printf("%s\n", "DONE."); + return 0; +} +#else +#include +#include +int main(int argc, char **argv) { + void *Handle = dlopen(argv[1], RTLD_NOW); + int (*Foo)(void); + + if (Handle == NULL) { + printf("dlopen() failed: %s\n", dlerror()); + return 1; + } + Foo = (int (*)(void)) dlsym(Handle, "foo"); + if (Handle == NULL) { + printf("dlsym() failed: %s\n", dlerror()); + return 1; + } + // CHECK: DONE. 
+ // CHECK-NOT: {{abort|fault}} + return Foo(); +} +#endif diff --git a/openmp/libomptarget/test/offloading/looptripcnt.c b/openmp/libomptarget/test/offloading/looptripcnt.c index 025231b0c6d32..855f47468c3e3 100644 --- a/openmp/libomptarget/test/offloading/looptripcnt.c +++ b/openmp/libomptarget/test/offloading/looptripcnt.c @@ -1,36 +1,36 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// REQUIRES: libomptarget-debug - -/* - Test for looptripcount being popped from runtime stack. -*/ -#include -#include -int main() -{ - int N = 128; - int NN = 1024; - int num_teams[NN]; - int num_threads[NN]; - - printf("#pragma omp target teams distribute parallel for thread_limit(4)\n"); -#pragma omp target teams distribute parallel for thread_limit(4) - for (int j = 0; j< N; j++) { - num_threads[j] = omp_get_num_threads(); - num_teams[j] = omp_get_num_teams(); - } - printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]); -// DEBUG: loop trip count is 128 - printf("#pragma omp target teams distribute parallel for\n"); -#pragma omp target teams distribute parallel for - for (int j = 0; j< N; j++) { - num_threads[j] = omp_get_num_threads(); - num_teams[j] = omp_get_num_teams(); - } - printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]); -// DEBUG: loop trip count is 128 - return 0; -} +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +/* + Test for looptripcount being popped from runtime stack. 
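+  Both target regions below have a compile-time trip count of 128, and the
+  DEBUG run lines expect "loop trip count is 128" to be reported once per
+  region, i.e. each pushed trip count is consumed by its own region rather
+  than leaking into the next one.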
+*/
+#include <stdio.h>
+#include <omp.h>
+int main()
+{
+  int N = 128;
+  int NN = 1024;
+  int num_teams[NN];
+  int num_threads[NN];
+
+  printf("#pragma omp target teams distribute parallel for thread_limit(4)\n");
+#pragma omp target teams distribute parallel for thread_limit(4)
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  printf("#pragma omp target teams distribute parallel for\n");
+#pragma omp target teams distribute parallel for
+  for (int j = 0; j < N; j++) {
+    num_threads[j] = omp_get_num_threads();
+    num_teams[j] = omp_get_num_teams();
+  }
+  printf("num_threads %d num_teams %d\n", num_threads[0], num_teams[0]);
+// DEBUG: loop trip count is 128
+  return 0;
+}
diff --git a/openmp/libomptarget/test/offloading/offloading_ext_success.c b/openmp/libomptarget/test/offloading/offloading_ext_success.c
new file mode 100644
index 0000000000000..86e294feba5ad
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/offloading_ext_success.c
@@ -0,0 +1,60 @@
+// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
+// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
+
+#include <omp.h>
+#include <stdio.h>
+#include <stdlib.h>
+#define N 128
+
+int main(void) {
+  int num_d = omp_get_num_devices();
+  int h = omp_get_initial_device();
+  double host_buf[N];
+  double *mem_dev_src;
+  double *mem_dev_dst;
+  int rc = 0;
+
+  if (num_d < 1) {
+    printf("no device in system\n");
+  } else if (num_d == 1) {
+    printf("only one device in system\n");
+  } else {
+    // Devices are numbered 0 .. num_d-1; use the first two for the
+    // device-to-device copy.
+    int src_dev = 0;
+    int dst_dev = 1;
+
+    // Memory allocation on both devices.
+    mem_dev_src = (double *)omp_target_alloc(sizeof(double) * N, src_dev);
+    if (mem_dev_src == NULL) {
+      printf("mem allocation on src device failed\n");
+      return -1;
+    }
+    mem_dev_dst = (double *)omp_target_alloc(sizeof(double) * N, dst_dev);
+    if (mem_dev_dst == NULL) {
+      printf("mem allocation on dst device failed\n");
+      return -1;
+    }
+
+    // Device memory cannot be dereferenced on the host, so initialize a
+    // host buffer and copy it to the source device first.
+    for (int i = 0; i < N; i++)
+      host_buf[i] = (double)rand();
+    rc = omp_target_memcpy(mem_dev_src, host_buf, sizeof(double) * N, 0, 0,
+                           src_dev, h);
+
+    // Device-to-device copy of the whole buffer.
+    if (rc == 0)
+      rc = omp_target_memcpy(mem_dev_dst, mem_dev_src, sizeof(double) * N, 0,
+                             0, dst_dev, src_dev);
+
+    omp_target_free(mem_dev_src, src_dev);
+    omp_target_free(mem_dev_dst, dst_dev);
+  }
+
+  // CHECK: PASS
+  printf("%s\n", rc == 0 ? "PASS" : "FAIL");
+  return rc;
+}
diff --git a/openmp/libomptarget/test/offloading/offloading_success.c b/openmp/libomptarget/test/offloading/offloading_success.c
index 12e78fac1f5a3..e5e108ea5d84f 100644
--- a/openmp/libomptarget/test/offloading/offloading_success.c
+++ b/openmp/libomptarget/test/offloading/offloading_success.c
@@ -1,23 +1,23 @@
-// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu
-// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu
-
-#include
-#include
-
-int main(void) {
-  int isHost = -1;
-
-#pragma omp target map(from: isHost)
-  { isHost = omp_is_initial_device(); }
-
-  if (isHost < 0) {
-    printf("Runtime error, isHost=%d\n", isHost);
-  }
-
-  // CHECK: Target region executed on the device
-  printf("Target region executed on the %s\n", isHost ?
"host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int main(void) { + int isHost = -1; + +#pragma omp target map(from: isHost) + { isHost = omp_is_initial_device(); } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? "host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/offloading/offloading_success.cpp b/openmp/libomptarget/test/offloading/offloading_success.cpp index eecd97a3f317d..1b84fa86e93b4 100644 --- a/openmp/libomptarget/test/offloading/offloading_success.cpp +++ b/openmp/libomptarget/test/offloading/offloading_success.cpp @@ -1,23 +1,23 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -int main(void) { - int isHost = 0; - -#pragma omp target map(from: isHost) - { isHost = omp_is_initial_device(); } - - if (isHost < 0) { - printf("Runtime error, isHost=%d\n", isHost); - } - - // CHECK: Target region executed on the device - printf("Target region executed on the %s\n", isHost ? "host" : "device"); - - return isHost; -} +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +int main(void) { + int isHost = 0; + +#pragma omp target map(from: isHost) + { isHost = omp_is_initial_device(); } + + if (isHost < 0) { + printf("Runtime error, isHost=%d\n", isHost); + } + + // CHECK: Target region executed on the device + printf("Target region executed on the %s\n", isHost ? 
"host" : "device"); + + return isHost; +} diff --git a/openmp/libomptarget/test/offloading/parallel_offloading_map.c b/openmp/libomptarget/test/offloading/parallel_offloading_map.c index 3bd59574747d5..c4a766b95defa 100644 --- a/openmp/libomptarget/test/offloading/parallel_offloading_map.c +++ b/openmp/libomptarget/test/offloading/parallel_offloading_map.c @@ -1,41 +1,41 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty -#include -#include - -int main(int argc, char *argv[]) { - const int num_threads = 64, N = 128; - int array[num_threads] = {0}; - -#pragma omp parallel for - for (int i = 0; i < num_threads; ++i) { - int tmp[N]; - - for (int j = 0; j < N; ++j) { - tmp[j] = i; - } - -#pragma omp target teams distribute parallel for map(tofrom : tmp) - for (int j = 0; j < N; ++j) { - tmp[j] += j; - } - - for (int j = 0; j < N; ++j) { - array[i] += tmp[j]; - } - } - - // Verify - for (int i = 0; i < num_threads; ++i) { - const int ref = (0 + N - 1) * N / 2 + i * N; - assert(array[i] == ref); - } - - printf("PASS\n"); - - return 0; -} - -// CHECK: PASS +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-aarch64-unknown-linux-gnu | %fcheck-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64-ibm-linux-gnu | %fcheck-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-powerpc64le-ibm-linux-gnu | %fcheck-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env OMP_MAX_ACTIVE_LEVELS=2 %libomptarget-run-x86_64-pc-linux-gnu | %fcheck-x86_64-pc-linux-gnu -allow-empty +#include +#include + +int main(int argc, char *argv[]) { + const int num_threads = 64, N = 128; + int array[num_threads] = {0}; + +#pragma omp parallel for + for (int i = 0; i < num_threads; ++i) { + int tmp[N]; + + for (int j = 0; j < N; ++j) { + tmp[j] = i; + } + +#pragma omp target teams distribute parallel for map(tofrom : tmp) + for (int j = 0; j < N; ++j) { + tmp[j] += j; + } + + for (int j = 0; j < N; ++j) { + array[i] += tmp[j]; + } + } + + // Verify + for (int i = 0; i < num_threads; ++i) { + const int ref = (0 + N - 1) * N / 2 + i * N; + assert(array[i] == ref); + } + + printf("PASS\n"); + + return 0; +} + +// CHECK: PASS diff --git a/openmp/libomptarget/test/offloading/requires.c b/openmp/libomptarget/test/offloading/requires.c index 079ce5cb9348c..6ebf22db97ecb 100644 --- a/openmp/libomptarget/test/offloading/requires.c +++ b/openmp/libomptarget/test/offloading/requires.c @@ -1,46 +1,46 @@ -// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 
%libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG -// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG -// REQUIRES: libomptarget-debug - -/* - Test for the 'requires' clause check. - When a target region is used, the requires flags are set in the - runtime for the entire compilation unit. If the flags are set again, - (for whatever reason) the set must be consistent with previously - set values. -*/ -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -void run_reg_requires() { - // Before the target region is registered, the requires registers the status - // of the requires clauses. Since there are no requires clauses in this file - // the flags state can only be OMP_REQ_NONE i.e. 1. - - // This is the 2nd time this function is called so it should print the debug - // info belonging to the check. - __tgt_register_requires(1); - __tgt_register_requires(1); - // DEBUG: New requires flags 1 compatible with existing 1! -} - -// --------------------------------------------------------------------------- -int main() { - run_reg_requires(); - -// This also runs reg requires for the first time. -#pragma omp target - {} - - return 0; -} +// RUN: %libomptarget-compile-aarch64-unknown-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-aarch64-unknown-linux-gnu 2>&1 | %fcheck-aarch64-unknown-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64-ibm-linux-gnu 2>&1 | %fcheck-powerpc64-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-powerpc64le-ibm-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-powerpc64le-ibm-linux-gnu 2>&1 | %fcheck-powerpc64le-ibm-linux-gnu -allow-empty -check-prefix=DEBUG +// RUN: %libomptarget-compile-x86_64-pc-linux-gnu && env LIBOMPTARGET_DEBUG=1 %libomptarget-run-x86_64-pc-linux-gnu 2>&1 | %fcheck-x86_64-pc-linux-gnu -allow-empty -check-prefix=DEBUG +// REQUIRES: libomptarget-debug + +/* + Test for the 'requires' clause check. + When a target region is used, the requires flags are set in the + runtime for the entire compilation unit. If the flags are set again, + (for whatever reason) the set must be consistent with previously + set values. +*/ +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +void run_reg_requires() { + // Before the target region is registered, the requires registers the status + // of the requires clauses. Since there are no requires clauses in this file + // the flags state can only be OMP_REQ_NONE i.e. 1. 
+ + // This is the 2nd time this function is called so it should print the debug + // info belonging to the check. + __tgt_register_requires(1); + __tgt_register_requires(1); + // DEBUG: New requires flags 1 compatible with existing 1! +} + +// --------------------------------------------------------------------------- +int main() { + run_reg_requires(); + +// This also runs reg requires for the first time. +#pragma omp target + {} + + return 0; +} diff --git a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp index 2c1c7e7191882..636d076c59815 100644 --- a/openmp/libomptarget/test/offloading/target_depend_nowait.cpp +++ b/openmp/libomptarget/test/offloading/target_depend_nowait.cpp @@ -1,62 +1,62 @@ -// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -#define N 1024 - -int A[N]; -int B[N]; -int C[N]; -int main() { - for (int i = 0; i < N; i++) - A[i] = B[i] = i; - -#pragma omp parallel num_threads(2) - { - if (omp_get_thread_num() == 1) { -// map data A & B and move to -#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait - -// no data move since already mapped -#pragma omp target map(A, B) depend(out : A[0]) nowait - { - for (int i = 0; i < N; i++) - ++A[i]; - for (int i = 0; i < N; i++) - ++B[i]; - } - -// no data move since already mapped -#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait - { - for (int i = 0; i < N; i++) - ++A[i]; - for (int i = 0; i < N; i++) - ++B[i]; - } - -// A updated via update -#pragma omp target update from(A) depend(out : A[0]) nowait - -// B updated via exit, A just released -#pragma omp target exit data map(release \ - : A) map(from \ - : B) depend(out \ - : A[0]) nowait - } // if - } // parallel - - int Sum = 0; - for (int i = 0; i < N; i++) - Sum += A[i] + B[i]; - // Sum is 2 * N * (2 + N - 1 + 2) / 2 - // CHECK: Sum = 1051648. 
- printf("Sum = %d.\n", Sum); - - return Sum != 2 * N * (2 + N - 1 + 2) / 2; -} - +// RUN: %libomptarget-compilexx-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compilexx-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +#define N 1024 + +int A[N]; +int B[N]; +int C[N]; +int main() { + for (int i = 0; i < N; i++) + A[i] = B[i] = i; + +#pragma omp parallel num_threads(2) + { + if (omp_get_thread_num() == 1) { +// map data A & B and move to +#pragma omp target enter data map(to : A, B) depend(out : A[0]) nowait + +// no data move since already mapped +#pragma omp target map(A, B) depend(out : A[0]) nowait + { + for (int i = 0; i < N; i++) + ++A[i]; + for (int i = 0; i < N; i++) + ++B[i]; + } + +// no data move since already mapped +#pragma omp target teams num_teams(1) map(A, B) depend(out : A[0]) nowait + { + for (int i = 0; i < N; i++) + ++A[i]; + for (int i = 0; i < N; i++) + ++B[i]; + } + +// A updated via update +#pragma omp target update from(A) depend(out : A[0]) nowait + +// B updated via exit, A just released +#pragma omp target exit data map(release \ + : A) map(from \ + : B) depend(out \ + : A[0]) nowait + } // if + } // parallel + + int Sum = 0; + for (int i = 0; i < N; i++) + Sum += A[i] + B[i]; + // Sum is 2 * N * (2 + N - 1 + 2) / 2 + // CHECK: Sum = 1051648. + printf("Sum = %d.\n", Sum); + + return Sum != 2 * N * (2 + N - 1 + 2) / 2; +} + diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c index b0a71ad358017..4a6af5eb4b903 100644 --- a/openmp/libomptarget/test/unified_shared_memory/api.c +++ b/openmp/libomptarget/test/unified_shared_memory/api.c @@ -1,164 +1,164 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -void init(int A[], int B[], int C[]) { - for (int i = 0; i < N; ++i) { - A[i] = 0; - B[i] = 1; - C[i] = i; - } -} - -int main(int argc, char *argv[]) { - const int device = omp_get_default_device(); - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - // CHECK: Initial device: -10 - printf("Initial device: %d\n", omp_get_initial_device()); - - // - // Target alloc & target memcpy - // - int A[N], B[N], C[N]; - - // Init - init(A, B, C); - - int *pA, *pB, *pC; - - // map ptrs - pA = &A[0]; - pB = &B[0]; - pC = &C[0]; - - int *d_A = (int *)omp_target_alloc(N * sizeof(int), device); - int *d_B = (int *)omp_target_alloc(N * sizeof(int), device); - int *d_C = (int *)omp_target_alloc(N * sizeof(int), device); - - // CHECK: omp_target_alloc succeeded - printf("omp_target_alloc %s\n", d_A && d_B && d_C ? 
"succeeded" : "failed"); - - omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device, - omp_get_initial_device()); - omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device, - omp_get_initial_device()); - -#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device) - { -#pragma omp parallel for schedule(static, 1) - for (int i = 0; i < N; i++) { - d_A[i] = d_B[i] + d_C[i] + 1; - } - } - - omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(), - device); - - // CHECK: Test omp_target_memcpy: Succeeded - int fail = 0; - for (int i = 0; i < N; ++i) { - if (A[i] != i + 2) - fail++; - } - if (fail) { - printf("Test omp_target_memcpy: Failed\n"); - } else { - printf("Test omp_target_memcpy: Succeeded\n"); - } - - // - // target_is_present and target_associate/disassociate_ptr - // - init(A, B, C); - - // CHECK: B is not present, associating it... - // CHECK: omp_target_associate_ptr B succeeded - if (!omp_target_is_present(B, device)) { - printf("B is not present, associating it...\n"); - int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device); - printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed"); - } - - // CHECK: C is not present, associating it... - // CHECK: omp_target_associate_ptr C succeeded - if (!omp_target_is_present(C, device)) { - printf("C is not present, associating it...\n"); - int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device); - printf("omp_target_associate_ptr C %s\n", !rc ? "succeeded" : "failed"); - } - -// CHECK: Inside target data: A is not present -// CHECK: Inside target data: B is present -// CHECK: Inside target data: C is present -#pragma omp target data map(from : B, C) device(device) - { - printf("Inside target data: A is%s present\n", - omp_target_is_present(A, device) ? "" : " not"); - printf("Inside target data: B is%s present\n", - omp_target_is_present(B, device) ? "" : " not"); - printf("Inside target data: C is%s present\n", - omp_target_is_present(C, device) ? "" : " not"); - -#pragma omp target map(from : A) device(device) - { -#pragma omp parallel for schedule(static, 1) - for (int i = 0; i < N; i++) - A[i] = B[i] + C[i] + 1; - } - } - - // CHECK: B is present, disassociating it... - // CHECK: omp_target_disassociate_ptr B succeeded - // CHECK: C is present, disassociating it... - // CHECK: omp_target_disassociate_ptr C succeeded - if (omp_target_is_present(B, device)) { - printf("B is present, disassociating it...\n"); - int rc = omp_target_disassociate_ptr(B, device); - printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed"); - } - if (omp_target_is_present(C, device)) { - printf("C is present, disassociating it...\n"); - int rc = omp_target_disassociate_ptr(C, device); - printf("omp_target_disassociate_ptr C %s\n", !rc ? 
"succeeded" : "failed"); - } - - // CHECK: Test omp_target_associate_ptr: Succeeded - fail = 0; - for (int i = 0; i < N; ++i) { - if (A[i] != i + 2) - fail++; - } - if (fail) { - printf("Test omp_target_associate_ptr: Failed\n"); - } else { - printf("Test omp_target_associate_ptr: Succeeded\n"); - } - - omp_target_free(d_A, device); - omp_target_free(d_B, device); - omp_target_free(d_C, device); - - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +void init(int A[], int B[], int C[]) { + for (int i = 0; i < N; ++i) { + A[i] = 0; + B[i] = 1; + C[i] = i; + } +} + +int main(int argc, char *argv[]) { + const int device = omp_get_default_device(); + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + // CHECK: Initial device: -10 + printf("Initial device: %d\n", omp_get_initial_device()); + + // + // Target alloc & target memcpy + // + int A[N], B[N], C[N]; + + // Init + init(A, B, C); + + int *pA, *pB, *pC; + + // map ptrs + pA = &A[0]; + pB = &B[0]; + pC = &C[0]; + + int *d_A = (int *)omp_target_alloc(N * sizeof(int), device); + int *d_B = (int *)omp_target_alloc(N * sizeof(int), device); + int *d_C = (int *)omp_target_alloc(N * sizeof(int), device); + + // CHECK: omp_target_alloc succeeded + printf("omp_target_alloc %s\n", d_A && d_B && d_C ? "succeeded" : "failed"); + + omp_target_memcpy(d_B, pB, N * sizeof(int), 0, 0, device, + omp_get_initial_device()); + omp_target_memcpy(d_C, pC, N * sizeof(int), 0, 0, device, + omp_get_initial_device()); + +#pragma omp target is_device_ptr(d_A, d_B, d_C) device(device) + { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < N; i++) { + d_A[i] = d_B[i] + d_C[i] + 1; + } + } + + omp_target_memcpy(pA, d_A, N * sizeof(int), 0, 0, omp_get_initial_device(), + device); + + // CHECK: Test omp_target_memcpy: Succeeded + int fail = 0; + for (int i = 0; i < N; ++i) { + if (A[i] != i + 2) + fail++; + } + if (fail) { + printf("Test omp_target_memcpy: Failed\n"); + } else { + printf("Test omp_target_memcpy: Succeeded\n"); + } + + // + // target_is_present and target_associate/disassociate_ptr + // + init(A, B, C); + + // CHECK: B is not present, associating it... + // CHECK: omp_target_associate_ptr B succeeded + if (!omp_target_is_present(B, device)) { + printf("B is not present, associating it...\n"); + int rc = omp_target_associate_ptr(B, d_B, N * sizeof(int), 0, device); + printf("omp_target_associate_ptr B %s\n", !rc ? "succeeded" : "failed"); + } + + // CHECK: C is not present, associating it... + // CHECK: omp_target_associate_ptr C succeeded + if (!omp_target_is_present(C, device)) { + printf("C is not present, associating it...\n"); + int rc = omp_target_associate_ptr(C, d_C, N * sizeof(int), 0, device); + printf("omp_target_associate_ptr C %s\n", !rc ? 
"succeeded" : "failed"); + } + +// CHECK: Inside target data: A is not present +// CHECK: Inside target data: B is present +// CHECK: Inside target data: C is present +#pragma omp target data map(from : B, C) device(device) + { + printf("Inside target data: A is%s present\n", + omp_target_is_present(A, device) ? "" : " not"); + printf("Inside target data: B is%s present\n", + omp_target_is_present(B, device) ? "" : " not"); + printf("Inside target data: C is%s present\n", + omp_target_is_present(C, device) ? "" : " not"); + +#pragma omp target map(from : A) device(device) + { +#pragma omp parallel for schedule(static, 1) + for (int i = 0; i < N; i++) + A[i] = B[i] + C[i] + 1; + } + } + + // CHECK: B is present, disassociating it... + // CHECK: omp_target_disassociate_ptr B succeeded + // CHECK: C is present, disassociating it... + // CHECK: omp_target_disassociate_ptr C succeeded + if (omp_target_is_present(B, device)) { + printf("B is present, disassociating it...\n"); + int rc = omp_target_disassociate_ptr(B, device); + printf("omp_target_disassociate_ptr B %s\n", !rc ? "succeeded" : "failed"); + } + if (omp_target_is_present(C, device)) { + printf("C is present, disassociating it...\n"); + int rc = omp_target_disassociate_ptr(C, device); + printf("omp_target_disassociate_ptr C %s\n", !rc ? "succeeded" : "failed"); + } + + // CHECK: Test omp_target_associate_ptr: Succeeded + fail = 0; + for (int i = 0; i < N; ++i) { + if (A[i] != i + 2) + fail++; + } + if (fail) { + printf("Test omp_target_associate_ptr: Failed\n"); + } else { + printf("Test omp_target_associate_ptr: Succeeded\n"); + } + + omp_target_free(d_A, device); + omp_target_free(d_B, device); + omp_target_free(d_C, device); + + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c index 4cedbae36004b..39d185a7be751 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -1,95 +1,95 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 - -#include -#include - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc = 0, *device_alloc = 0; - int *a = (int *)malloc(N * sizeof(int)); - - // Init - for (int i = 0; i < N; ++i) { - a[i] = 10; - } - host_alloc = &a[0]; - - // - // map + target no close - // -#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc) - { -#pragma omp target map(tofrom : device_alloc) - { device_alloc = &a[0]; } - } - - // CHECK: a used from unified memory. - if (device_alloc == host_alloc) - printf("a used from unified memory.\n"); - - // - // map + target with close - // - device_alloc = 0; -#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc) - { -#pragma omp target map(tofrom : device_alloc) - { device_alloc = &a[0]; } - } - // CHECK: a copied to device. 
- if (device_alloc != host_alloc) - printf("a copied to device.\n"); - - // - // map + use_device_ptr no close - // - device_alloc = 0; -#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a) - { device_alloc = &a[0]; } - - // CHECK: a used from unified memory with use_device_ptr. - if (device_alloc == host_alloc) - printf("a used from unified memory with use_device_ptr.\n"); - - // - // map + use_device_ptr close - // - device_alloc = 0; -#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a) - { device_alloc = &a[0]; } - - // CHECK: a used from device memory with use_device_ptr. - if (device_alloc != host_alloc) - printf("a used from device memory with use_device_ptr.\n"); - - // - // map enter/exit + close - // - device_alloc = 0; -#pragma omp target enter data map(close, to : a[ : N]) - -#pragma omp target map(from : device_alloc) - { device_alloc = &a[0]; } - -#pragma omp target exit data map(from : a[ : N]) - - // CHECK: a has been mapped to the device. - if (device_alloc != host_alloc) - printf("a has been mapped to the device.\n"); - - free(a); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + + // + // map + target no close + // +#pragma omp target data map(tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + + // CHECK: a used from unified memory. + if (device_alloc == host_alloc) + printf("a used from unified memory.\n"); + + // + // map + target with close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) map(tofrom : device_alloc) + { +#pragma omp target map(tofrom : device_alloc) + { device_alloc = &a[0]; } + } + // CHECK: a copied to device. + if (device_alloc != host_alloc) + printf("a copied to device.\n"); + + // + // map + use_device_ptr no close + // + device_alloc = 0; +#pragma omp target data map(tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from unified memory with use_device_ptr. + if (device_alloc == host_alloc) + printf("a used from unified memory with use_device_ptr.\n"); + + // + // map + use_device_ptr close + // + device_alloc = 0; +#pragma omp target data map(close, tofrom : a[ : N]) use_device_ptr(a) + { device_alloc = &a[0]; } + + // CHECK: a used from device memory with use_device_ptr. + if (device_alloc != host_alloc) + printf("a used from device memory with use_device_ptr.\n"); + + // + // map enter/exit + close + // + device_alloc = 0; +#pragma omp target enter data map(close, to : a[ : N]) + +#pragma omp target map(from : device_alloc) + { device_alloc = &a[0]; } + +#pragma omp target exit data map(from : a[ : N]) + + // CHECK: a has been mapped to the device. + if (device_alloc != host_alloc) + printf("a has been mapped to the device.\n"); + + free(a); + + // CHECK: Done! 
+ printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_manual.c b/openmp/libomptarget/test/unified_shared_memory/close_manual.c index 0417b8bf254e3..37a499cc7a342 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_manual.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_manual.c @@ -1,86 +1,86 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types); - -extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num, - void **args_base, void **args, - int64_t *arg_sizes, int64_t *arg_types); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc = 0, *device_alloc = 0; - int *a = (int *)malloc(N * sizeof(int)); - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - // Init - for (int i = 0; i < N; ++i) { - a[i] = 10; - } - host_alloc = &a[0]; - -// Dummy target region that ensures the runtime library is loaded when -// the target data begin/end functions are manually called below. -#pragma omp target - {} - - // Manual calls - int device_id = omp_get_default_device(); - int arg_num = 1; - void **args_base = (void **)&a; - void **args = (void **)&a; - int64_t arg_sizes[arg_num]; - - arg_sizes[0] = sizeof(int) * N; - - int64_t arg_types[arg_num]; - - // Ox400 enables the CLOSE map type in the runtime: - // OMP_TGT_MAPTYPE_CLOSE = 0x400 - // OMP_TGT_MAPTYPE_TO = 0x001 - arg_types[0] = 0x400 | 0x001; - - device_alloc = host_alloc; - - __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, - arg_types); - -#pragma omp target data use_device_ptr(a) - { device_alloc = a; } - - __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, - arg_types); - - // CHECK: a was copied to the device - if (device_alloc != host_alloc) - printf("a was copied to the device\n"); - - free(a); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +extern void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +extern void __tgt_target_data_end(int64_t device_id, int32_t arg_num, + void **args_base, void **args, + int64_t *arg_sizes, int64_t *arg_types); + +// End of definitions copied from OpenMP RTL. 
+// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc = 0, *device_alloc = 0; + int *a = (int *)malloc(N * sizeof(int)); + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + // Init + for (int i = 0; i < N; ++i) { + a[i] = 10; + } + host_alloc = &a[0]; + +// Dummy target region that ensures the runtime library is loaded when +// the target data begin/end functions are manually called below. +#pragma omp target + {} + + // Manual calls + int device_id = omp_get_default_device(); + int arg_num = 1; + void **args_base = (void **)&a; + void **args = (void **)&a; + int64_t arg_sizes[arg_num]; + + arg_sizes[0] = sizeof(int) * N; + + int64_t arg_types[arg_num]; + + // Ox400 enables the CLOSE map type in the runtime: + // OMP_TGT_MAPTYPE_CLOSE = 0x400 + // OMP_TGT_MAPTYPE_TO = 0x001 + arg_types[0] = 0x400 | 0x001; + + device_alloc = host_alloc; + + __tgt_target_data_begin(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + +#pragma omp target data use_device_ptr(a) + { device_alloc = a; } + + __tgt_target_data_end(device_id, arg_num, args_base, args, arg_sizes, + arg_types); + + // CHECK: a was copied to the device + if (device_alloc != host_alloc) + printf("a was copied to the device\n"); + + free(a); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c index b319c6b69ac29..a3ca71caf022f 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_modifier.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_modifier.c @@ -1,135 +1,135 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 - -#include -#include - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc, *device_alloc; - void *host_data, *device_data; - int *alloc = (int *)malloc(N * sizeof(int)); - int data[N]; - - for (int i = 0; i < N; ++i) { - alloc[i] = 10; - data[i] = 1; - } - - host_data = &data[0]; - host_alloc = &alloc[0]; - -// -// Test that updates on the device are not visible to host -// when only a TO mapping is used. -// -#pragma omp target map(tofrom \ - : device_data, device_alloc) map(close, to \ - : alloc[:N], data \ - [:N]) - { - device_data = &data[0]; - device_alloc = &alloc[0]; - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Address of alloc on device different from host address. - if (device_alloc != host_alloc) - printf("Address of alloc on device different from host address.\n"); - - // CHECK: Address of data on device different from host address. - if (device_data != host_data) - printf("Address of data on device different from host address.\n"); - - // On the host, check that the arrays have been updated. - // CHECK: Alloc host values not updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 10) - fails++; - } - printf("Alloc host values not updated: %s\n", - (fails == 0) ? 
"Succeeded" : "Failed"); - - // CHECK: Data host values not updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 1) - fails++; - } - printf("Data host values not updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // - // Test that updates on the device are visible on host - // when a from is used. - // - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - -#pragma omp target map(close, tofrom : alloc[:N], data[:N]) - { - // CHECK: Alloc device values are correct: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 11) - fails++; - } - printf("Alloc device values are correct: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - // CHECK: Data device values are correct: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 2) - fails++; - } - printf("Data device values are correct: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // Update values on the device - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Alloc host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 12) - fails++; - } - printf("Alloc host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // CHECK: Data host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 3) - fails++; - } - printf("Data host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - free(alloc); - - // CHECK: Done! - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +// UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 + +#include +#include + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc, *device_alloc; + void *host_data, *device_data; + int *alloc = (int *)malloc(N * sizeof(int)); + int data[N]; + + for (int i = 0; i < N; ++i) { + alloc[i] = 10; + data[i] = 1; + } + + host_data = &data[0]; + host_alloc = &alloc[0]; + +// +// Test that updates on the device are not visible to host +// when only a TO mapping is used. +// +#pragma omp target map(tofrom \ + : device_data, device_alloc) map(close, to \ + : alloc[:N], data \ + [:N]) + { + device_data = &data[0]; + device_alloc = &alloc[0]; + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Address of alloc on device different from host address. + if (device_alloc != host_alloc) + printf("Address of alloc on device different from host address.\n"); + + // CHECK: Address of data on device different from host address. + if (device_data != host_data) + printf("Address of data on device different from host address.\n"); + + // On the host, check that the arrays have been updated. + // CHECK: Alloc host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 10) + fails++; + } + printf("Alloc host values not updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values not updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 1) + fails++; + } + printf("Data host values not updated: %s\n", + (fails == 0) ? 
"Succeeded" : "Failed"); + + // + // Test that updates on the device are visible on host + // when a from is used. + // + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + +#pragma omp target map(close, tofrom : alloc[:N], data[:N]) + { + // CHECK: Alloc device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 11) + fails++; + } + printf("Alloc device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + // CHECK: Data device values are correct: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 2) + fails++; + } + printf("Data device values are correct: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // Update values on the device + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Alloc host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 12) + fails++; + } + printf("Alloc host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data host values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 3) + fails++; + } + printf("Data host values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + free(alloc); + + // CHECK: Done! + printf("Done!\n"); + + return 0; +} diff --git a/openmp/libomptarget/test/unified_shared_memory/shared_update.c b/openmp/libomptarget/test/unified_shared_memory/shared_update.c index 8036bc2f0405f..b27c79a1a67fa 100644 --- a/openmp/libomptarget/test/unified_shared_memory/shared_update.c +++ b/openmp/libomptarget/test/unified_shared_memory/shared_update.c @@ -1,114 +1,114 @@ -// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu -// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu - -#include -#include - -// --------------------------------------------------------------------------- -// Various definitions copied from OpenMP RTL - -extern void __tgt_register_requires(int64_t); - -// End of definitions copied from OpenMP RTL. -// --------------------------------------------------------------------------- - -#pragma omp requires unified_shared_memory - -#define N 1024 - -int main(int argc, char *argv[]) { - int fails; - void *host_alloc, *device_alloc; - void *host_data, *device_data; - int *alloc = (int *)malloc(N * sizeof(int)); - int data[N]; - - // Manual registration of requires flags for Clang versions - // that do not support requires. - __tgt_register_requires(8); - - for (int i = 0; i < N; ++i) { - alloc[i] = 10; - data[i] = 1; - } - - host_data = &data[0]; - host_alloc = &alloc[0]; - -// implicit mapping of data -#pragma omp target map(tofrom : device_data, device_alloc) - { - device_data = &data[0]; - device_alloc = &alloc[0]; - - for (int i = 0; i < N; i++) { - alloc[i] += 1; - data[i] += 1; - } - } - - // CHECK: Address of alloc on device matches host address. - if (device_alloc == host_alloc) - printf("Address of alloc on device matches host address.\n"); - - // CHECK: Address of data on device matches host address. - if (device_data == host_data) - printf("Address of data on device matches host address.\n"); - - // On the host, check that the arrays have been updated. 
- // CHECK: Alloc device values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 11) - fails++; - } - printf("Alloc device values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // CHECK: Data device values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 2) - fails++; - } - printf("Data device values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - - // - // Test that updates on the host snd on the device are both visible. - // - - // Update on the host. - for (int i = 0; i < N; ++i) { - alloc[i] += 1; - data[i] += 1; - } - -#pragma omp target - { - // CHECK: Alloc host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (alloc[i] != 12) - fails++; - } - printf("Alloc host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - // CHECK: Data host values updated: Succeeded - fails = 0; - for (int i = 0; i < N; i++) { - if (data[i] != 3) - fails++; - } - printf("Data host values updated: %s\n", - (fails == 0) ? "Succeeded" : "Failed"); - } - - free(alloc); - - printf("Done!\n"); - - return 0; -} +// RUN: %libomptarget-compile-run-and-check-aarch64-unknown-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-powerpc64le-ibm-linux-gnu +// RUN: %libomptarget-compile-run-and-check-x86_64-pc-linux-gnu + +#include +#include + +// --------------------------------------------------------------------------- +// Various definitions copied from OpenMP RTL + +extern void __tgt_register_requires(int64_t); + +// End of definitions copied from OpenMP RTL. +// --------------------------------------------------------------------------- + +#pragma omp requires unified_shared_memory + +#define N 1024 + +int main(int argc, char *argv[]) { + int fails; + void *host_alloc, *device_alloc; + void *host_data, *device_data; + int *alloc = (int *)malloc(N * sizeof(int)); + int data[N]; + + // Manual registration of requires flags for Clang versions + // that do not support requires. + __tgt_register_requires(8); + + for (int i = 0; i < N; ++i) { + alloc[i] = 10; + data[i] = 1; + } + + host_data = &data[0]; + host_alloc = &alloc[0]; + +// implicit mapping of data +#pragma omp target map(tofrom : device_data, device_alloc) + { + device_data = &data[0]; + device_alloc = &alloc[0]; + + for (int i = 0; i < N; i++) { + alloc[i] += 1; + data[i] += 1; + } + } + + // CHECK: Address of alloc on device matches host address. + if (device_alloc == host_alloc) + printf("Address of alloc on device matches host address.\n"); + + // CHECK: Address of data on device matches host address. + if (device_data == host_data) + printf("Address of data on device matches host address.\n"); + + // On the host, check that the arrays have been updated. + // CHECK: Alloc device values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (alloc[i] != 11) + fails++; + } + printf("Alloc device values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // CHECK: Data device values updated: Succeeded + fails = 0; + for (int i = 0; i < N; i++) { + if (data[i] != 2) + fails++; + } + printf("Data device values updated: %s\n", + (fails == 0) ? "Succeeded" : "Failed"); + + // + // Test that updates on the host snd on the device are both visible. + // + + // Update on the host. 
+  for (int i = 0; i < N; ++i) {
+    alloc[i] += 1;
+    data[i] += 1;
+  }
+
+#pragma omp target
+  {
+    // CHECK: Alloc host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (alloc[i] != 12)
+        fails++;
+    }
+    printf("Alloc host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+    // CHECK: Data host values updated: Succeeded
+    fails = 0;
+    for (int i = 0; i < N; i++) {
+      if (data[i] != 3)
+        fails++;
+    }
+    printf("Data host values updated: %s\n",
+           (fails == 0) ? "Succeeded" : "Failed");
+  }
+
+  free(alloc);
+
+  printf("Done!\n");
+
+  return 0;
+}
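
The tests above repeatedly use the device memory routines omp_target_alloc, omp_target_memcpy, and omp_target_free. As a reference for the argument order of omp_target_memcpy (destination first, lengths and offsets in bytes, then destination and source device numbers), the following standalone sketch shows a host-to-device-and-back round trip. It is not part of the patch; the file layout, N, and the printed messages are illustrative only.

#include <omp.h>
#include <stdio.h>

#define N 128

int main(void) {
  int dev = omp_get_default_device();
  int host = omp_get_initial_device();
  double src[N], dst[N];

  for (int i = 0; i < N; ++i)
    src[i] = (double)i;

  // Allocate N doubles on the default device; the size argument is in bytes.
  double *d_buf = (double *)omp_target_alloc(sizeof(double) * N, dev);
  if (d_buf == NULL) {
    printf("omp_target_alloc failed\n");
    return 1;
  }

  // omp_target_memcpy(dst, src, length, dst_offset, src_offset,
  //                   dst_device, src_device): copy host -> device ...
  int rc = omp_target_memcpy(d_buf, src, sizeof(double) * N, 0, 0, dev, host);
  // ... and copy the data back, device -> host.
  rc |= omp_target_memcpy(dst, d_buf, sizeof(double) * N, 0, 0, host, dev);

  omp_target_free(d_buf, dev);
  printf("round trip %s\n", rc == 0 ? "succeeded" : "failed");
  return rc;
}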