Home | History | Annotate | Line # | Download | only in Headers
      1 /*===---- __clang_cuda_runtime_wrapper.h - CUDA runtime support -------------===
      2  *
      3  * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
      4  * See https://llvm.org/LICENSE.txt for license information.
      5  * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
      6  *
      7  *===-----------------------------------------------------------------------===
      8  */
      9 
     10 /*
     11  * WARNING: This header is intended to be directly -include'd by
     12  * the compiler and is not supposed to be included by users.
     13  *
     14  * CUDA headers are implemented in a way that currently makes it
     15  * impossible for user code to #include directly when compiling with
     16  * Clang. They present different view of CUDA-supplied functions
     17  * depending on where in NVCC's compilation pipeline the headers are
     18  * included. Neither of these modes provides function definitions with
     19  * correct attributes, so we use preprocessor to force the headers
     20  * into a form that Clang can use.
     21  *
     22  * Similarly to NVCC which -include's cuda_runtime.h, Clang -include's
     23  * this file during every CUDA compilation.
     24  */
     25 
     26 #ifndef __CLANG_CUDA_RUNTIME_WRAPPER_H__
     27 #define __CLANG_CUDA_RUNTIME_WRAPPER_H__
     28 
     29 #if defined(__CUDA__) && defined(__clang__)
     30 
     31 // Include some forward declares that must come before cmath.
     32 #include <__clang_cuda_math_forward_declares.h>
     33 
     34 // Define __CUDACC__ early as libstdc++ standard headers with GNU extensions
     35 // enabled depend on it to avoid using __float128, which is unsupported in
     36 // CUDA.
     37 #define __CUDACC__
     38 
     39 // Include some standard headers to avoid CUDA headers including them
     40 // while some required macros (like __THROW) are in a weird state.
     41 #include <cmath>
     42 #include <cstdlib>
     43 #include <stdlib.h>
     44 #undef __CUDACC__
     45 
     46 // Preserve common macros that will be changed below by us or by CUDA
     47 // headers.
     48 #pragma push_macro("__THROW")
     49 #pragma push_macro("__CUDA_ARCH__")
     50 
     51 // WARNING: Preprocessor hacks below are based on specific details of
     52 // CUDA-7.x headers and are not expected to work with any other
     53 // version of CUDA headers.
     54 #include "cuda.h"
     55 #if !defined(CUDA_VERSION)
     56 #error "cuda.h did not define CUDA_VERSION"
     57 #elif CUDA_VERSION < 7000
     58 #error "Unsupported CUDA version!"
     59 #endif
     60 
     61 #pragma push_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
     62 #if CUDA_VERSION >= 10000
     63 #define __CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__
     64 #endif
     65 
     66 // Make largest subset of device functions available during host
     67 // compilation -- SM_35 for the time being.
     68 #ifndef __CUDA_ARCH__
     69 #define __CUDA_ARCH__ 350
     70 #endif
     71 
     72 #include "__clang_cuda_builtin_vars.h"
     73 
     74 // No need for device_launch_parameters.h as __clang_cuda_builtin_vars.h above
     75 // has taken care of builtin variables declared in the file.
     76 #define __DEVICE_LAUNCH_PARAMETERS_H__
     77 
     78 // {math,device}_functions.h only have declarations of the
     79 // functions. We don't need them as we're going to pull in their
     80 // definitions from .hpp files.
     81 #define __DEVICE_FUNCTIONS_H__
     82 #define __MATH_FUNCTIONS_H__
     83 #define __COMMON_FUNCTIONS_H__
     84 // device_functions_decls is replaced by __clang_cuda_device_functions.h
     85 // included below.
     86 #define __DEVICE_FUNCTIONS_DECLS_H__
     87 
     88 #undef __CUDACC__
     89 #if CUDA_VERSION < 9000
     90 #define __CUDABE__
     91 #else
     92 #define __CUDACC__
     93 #define __CUDA_LIBDEVICE__
     94 #endif
     95 // Disables definitions of device-side runtime support stubs in
     96 // cuda_device_runtime_api.h
     97 #include "host_defines.h"
     98 #undef __CUDACC__
     99 #include "driver_types.h"
    100 #include "host_config.h"
    101 
    102 // Temporarily replace "nv_weak" with weak, so __attribute__((nv_weak)) in
    103 // cuda_device_runtime_api.h ends up being __attribute__((weak)) which is the
    104 // functional equivalent of what we need.
    105 #pragma push_macro("nv_weak")
    106 #define nv_weak weak
    107 #undef __CUDABE__
    108 #undef __CUDA_LIBDEVICE__
    109 #define __CUDACC__
    110 #include "cuda_runtime.h"
    111 
    112 #pragma pop_macro("nv_weak")
    113 #undef __CUDACC__
    114 #define __CUDABE__
    115 
    116 // CUDA headers use __nvvm_memcpy and __nvvm_memset which Clang does
    117 // not have at the moment. Emulate them with a builtin memcpy/memset.
    118 #define __nvvm_memcpy(s, d, n, a) __builtin_memcpy(s, d, n)
    119 #define __nvvm_memset(d, c, n, a) __builtin_memset(d, c, n)
    120 
    121 #if CUDA_VERSION < 9000
    122 #include "crt/device_runtime.h"
    123 #endif
    124 #include "crt/host_runtime.h"
    125 // device_runtime.h defines __cxa_* macros that will conflict with
    126 // cxxabi.h.
    127 // FIXME: redefine these as __device__ functions.
    128 #undef __cxa_vec_ctor
    129 #undef __cxa_vec_cctor
    130 #undef __cxa_vec_dtor
    131 #undef __cxa_vec_new
    132 #undef __cxa_vec_new2
    133 #undef __cxa_vec_new3
    134 #undef __cxa_vec_delete2
    135 #undef __cxa_vec_delete
    136 #undef __cxa_vec_delete3
    137 #undef __cxa_pure_virtual
    138 
    139 // math_functions.hpp expects this host function be defined on MacOS, but it
    140 // ends up not being there because of the games we play here.  Just define it
    141 // ourselves; it's simple enough.
    142 #ifdef __APPLE__
    143 inline __host__ double __signbitd(double x) {
    144   return std::signbit(x);
    145 }
    146 #endif
    147 
    148 // CUDA 9.1 no longer provides declarations for libdevice functions, so we need
    149 // to provide our own.
    150 #include <__clang_cuda_libdevice_declares.h>
    151 
    152 // Wrappers for many device-side standard library functions, incl. math
    153 // functions, became compiler builtins in CUDA-9 and have been removed from the
    154 // CUDA headers. Clang now provides its own implementation of the wrappers.
    155 #if CUDA_VERSION >= 9000
    156 #include <__clang_cuda_device_functions.h>
    157 #include <__clang_cuda_math.h>
    158 #endif
    159 
    160 // __THROW is redefined to be empty by device_functions_decls.h in CUDA. Clang's
    161 // counterpart does not do it, so we need to make it empty here to keep
    162 // following CUDA includes happy.
    163 #undef __THROW
    164 #define __THROW
    165 
    166 // CUDA 8.0.41 relies on __USE_FAST_MATH__ and __CUDA_PREC_DIV's values.
    167 // Previous versions used to check whether they are defined or not.
    168 // CU_DEVICE_INVALID macro is only defined in 8.0.41, so we use it
    169 // here to detect the switch.
    170 
    171 #if defined(CU_DEVICE_INVALID)
    172 #if !defined(__USE_FAST_MATH__)
    173 #define __USE_FAST_MATH__ 0
    174 #endif
    175 
    176 #if !defined(__CUDA_PREC_DIV)
    177 #define __CUDA_PREC_DIV 0
    178 #endif
    179 #endif
    180 
    181 // Temporarily poison __host__ macro to ensure it's not used by any of
    182 // the headers we're about to include.
    183 #pragma push_macro("__host__")
    184 #define __host__ UNEXPECTED_HOST_ATTRIBUTE
    185 
    186 // device_functions.hpp and math_functions*.hpp use 'static
    187 // __forceinline__' (with no __device__) for definitions of device
    188 // functions. Temporarily redefine __forceinline__ to include
    189 // __device__.
    190 #pragma push_macro("__forceinline__")
    191 #define __forceinline__ __device__ __inline__ __attribute__((always_inline))
    192 #if CUDA_VERSION < 9000
    193 #include "device_functions.hpp"
    194 #endif
    195 
    196 // math_function.hpp uses the __USE_FAST_MATH__ macro to determine whether we
    197 // get the slow-but-accurate or fast-but-inaccurate versions of functions like
    198 // sin and exp.  This is controlled in clang by -fcuda-approx-transcendentals.
    199 //
    200 // device_functions.hpp uses __USE_FAST_MATH__ for a different purpose (fast vs.
    201 // slow divides), so we need to scope our define carefully here.
    202 #pragma push_macro("__USE_FAST_MATH__")
    203 #if defined(__CLANG_CUDA_APPROX_TRANSCENDENTALS__)
    204 #define __USE_FAST_MATH__ 1
    205 #endif
    206 
    207 #if CUDA_VERSION >= 9000
    208 // CUDA-9.2 needs host-side memcpy for some host functions in
    209 // device_functions.hpp
    210 #if CUDA_VERSION >= 9020
    211 #include <string.h>
    212 #endif
    213 #include "crt/math_functions.hpp"
    214 #else
    215 #include "math_functions.hpp"
    216 #endif
    217 
    218 #pragma pop_macro("__USE_FAST_MATH__")
    219 
    220 #if CUDA_VERSION < 9000
    221 #include "math_functions_dbl_ptx3.hpp"
    222 #endif
    223 #pragma pop_macro("__forceinline__")
    224 
    225 // Pull in host-only functions that are only available when neither
    226 // __CUDACC__ nor __CUDABE__ are defined.
    227 #undef __MATH_FUNCTIONS_HPP__
    228 #undef __CUDABE__
    229 #if CUDA_VERSION < 9000
    230 #include "math_functions.hpp"
    231 #endif
    232 // Alas, additional overloads for these functions are hard to get to.
    233 // Considering that we only need these overloads for a few functions,
    234 // we can provide them here.
    235 static inline float rsqrt(float __a) { return rsqrtf(__a); }
    236 static inline float rcbrt(float __a) { return rcbrtf(__a); }
    237 static inline float sinpi(float __a) { return sinpif(__a); }
    238 static inline float cospi(float __a) { return cospif(__a); }
    239 static inline void sincospi(float __a, float *__b, float *__c) {
    240   return sincospif(__a, __b, __c);
    241 }
    242 static inline float erfcinv(float __a) { return erfcinvf(__a); }
    243 static inline float normcdfinv(float __a) { return normcdfinvf(__a); }
    244 static inline float normcdf(float __a) { return normcdff(__a); }
    245 static inline float erfcx(float __a) { return erfcxf(__a); }
    246 
    247 #if CUDA_VERSION < 9000
    248 // For some reason single-argument variant is not always declared by
    249 // CUDA headers. Alas, device_functions.hpp included below needs it.
    250 static inline __device__ void __brkpt(int __c) { __brkpt(); }
    251 #endif
    252 
    253 // Now include *.hpp with definitions of various GPU functions.  Alas,
    254 // a lot of thins get declared/defined with __host__ attribute which
    255 // we don't want and we have to define it out. We also have to include
    256 // {device,math}_functions.hpp again in order to extract the other
    257 // branch of #if/else inside.
    258 #define __host__
    259 #undef __CUDABE__
    260 #define __CUDACC__
    261 #if CUDA_VERSION >= 9000
    262 // Some atomic functions became compiler builtins in CUDA-9 , so we need their
    263 // declarations.
    264 #include "device_atomic_functions.h"
    265 #endif
    266 #undef __DEVICE_FUNCTIONS_HPP__
    267 #include "device_atomic_functions.hpp"
    268 #if CUDA_VERSION >= 9000
    269 #include "crt/device_functions.hpp"
    270 #include "crt/device_double_functions.hpp"
    271 #else
    272 #include "device_functions.hpp"
    273 #define __CUDABE__
    274 #include "device_double_functions.h"
    275 #undef __CUDABE__
    276 #endif
    277 #include "sm_20_atomic_functions.hpp"
    278 #include "sm_20_intrinsics.hpp"
    279 #include "sm_32_atomic_functions.hpp"
    280 
    281 // Don't include sm_30_intrinsics.h and sm_32_intrinsics.h.  These define the
    282 // __shfl and __ldg intrinsics using inline (volatile) asm, but we want to
    283 // define them using builtins so that the optimizer can reason about and across
    284 // these instructions.  In particular, using intrinsics for ldg gets us the
    285 // [addr+imm] addressing mode, which, although it doesn't actually exist in the
    286 // hardware, seems to generate faster machine code because ptxas can more easily
    287 // reason about our code.
    288 
    289 #if CUDA_VERSION >= 8000
    290 #pragma push_macro("__CUDA_ARCH__")
    291 #undef __CUDA_ARCH__
    292 #include "sm_60_atomic_functions.hpp"
    293 #include "sm_61_intrinsics.hpp"
    294 #pragma pop_macro("__CUDA_ARCH__")
    295 #endif
    296 
    297 #undef __MATH_FUNCTIONS_HPP__
    298 
    299 // math_functions.hpp defines ::signbit as a __host__ __device__ function.  This
    300 // conflicts with libstdc++'s constexpr ::signbit, so we have to rename
    301 // math_function.hpp's ::signbit.  It's guarded by #undef signbit, but that's
    302 // conditional on __GNUC__.  :)
    303 #pragma push_macro("signbit")
    304 #pragma push_macro("__GNUC__")
    305 #undef __GNUC__
    306 #define signbit __ignored_cuda_signbit
    307 
    308 // CUDA-9 omits device-side definitions of some math functions if it sees
    309 // include guard from math.h wrapper from libstdc++. We have to undo the header
    310 // guard temporarily to get the definitions we need.
    311 #pragma push_macro("_GLIBCXX_MATH_H")
    312 #pragma push_macro("_LIBCPP_VERSION")
    313 #if CUDA_VERSION >= 9000
    314 #undef _GLIBCXX_MATH_H
    315 // We also need to undo another guard that checks for libc++ 3.8+
    316 #ifdef _LIBCPP_VERSION
    317 #define _LIBCPP_VERSION 3700
    318 #endif
    319 #endif
    320 
    321 #if CUDA_VERSION >= 9000
    322 #include "crt/math_functions.hpp"
    323 #else
    324 #include "math_functions.hpp"
    325 #endif
    326 #pragma pop_macro("_GLIBCXX_MATH_H")
    327 #pragma pop_macro("_LIBCPP_VERSION")
    328 #pragma pop_macro("__GNUC__")
    329 #pragma pop_macro("signbit")
    330 
    331 #pragma pop_macro("__host__")
    332 
    333 #include "texture_indirect_functions.h"
    334 
    335 // Restore state of __CUDA_ARCH__ and __THROW we had on entry.
    336 #pragma pop_macro("__CUDA_ARCH__")
    337 #pragma pop_macro("__THROW")
    338 
    339 // Set up compiler macros expected to be seen during compilation.
    340 #undef __CUDABE__
    341 #define __CUDACC__
    342 
    343 extern "C" {
    344 // Device-side CUDA system calls.
    345 // http://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/index.html#system-calls
    346 // We need these declarations and wrappers for device-side
    347 // malloc/free/printf calls to work without relying on
    348 // -fcuda-disable-target-call-checks option.
    349 __device__ int vprintf(const char *, const char *);
    350 __device__ void free(void *) __attribute((nothrow));
    351 __device__ void *malloc(size_t) __attribute((nothrow)) __attribute__((malloc));
    352 
    353 // __assertfail() used to have a `noreturn` attribute. Unfortunately that
    354 // contributed to triggering the longstanding bug in ptxas when assert was used
    355 // in sufficiently convoluted code. See
    356 // https://bugs.llvm.org/show_bug.cgi?id=27738 for the details.
    357 __device__ void __assertfail(const char *__message, const char *__file,
    358                              unsigned __line, const char *__function,
    359                              size_t __charSize);
    360 
    361 // In order for standard assert() macro on linux to work we need to
    362 // provide device-side __assert_fail()
    363 __device__ static inline void __assert_fail(const char *__message,
    364                                             const char *__file, unsigned __line,
    365                                             const char *__function) {
    366   __assertfail(__message, __file, __line, __function, sizeof(char));
    367 }
    368 
    369 // Clang will convert printf into vprintf, but we still need
    370 // device-side declaration for it.
    371 __device__ int printf(const char *, ...);
    372 } // extern "C"
    373 
    374 // We also need device-side std::malloc and std::free.
    375 namespace std {
    376 __device__ static inline void free(void *__ptr) { ::free(__ptr); }
    377 __device__ static inline void *malloc(size_t __size) {
    378   return ::malloc(__size);
    379 }
    380 } // namespace std
    381 
    382 // Out-of-line implementations from __clang_cuda_builtin_vars.h.  These need to
    383 // come after we've pulled in the definition of uint3 and dim3.
    384 
    385 __device__ inline __cuda_builtin_threadIdx_t::operator dim3() const {
    386   return dim3(x, y, z);
    387 }
    388 
    389 __device__ inline __cuda_builtin_threadIdx_t::operator uint3() const {
    390   return {x, y, z};
    391 }
    392 
    393 __device__ inline __cuda_builtin_blockIdx_t::operator dim3() const {
    394   return dim3(x, y, z);
    395 }
    396 
    397 __device__ inline __cuda_builtin_blockIdx_t::operator uint3() const {
    398   return {x, y, z};
    399 }
    400 
    401 __device__ inline __cuda_builtin_blockDim_t::operator dim3() const {
    402   return dim3(x, y, z);
    403 }
    404 
    405 __device__ inline __cuda_builtin_blockDim_t::operator uint3() const {
    406   return {x, y, z};
    407 }
    408 
    409 __device__ inline __cuda_builtin_gridDim_t::operator dim3() const {
    410   return dim3(x, y, z);
    411 }
    412 
    413 __device__ inline __cuda_builtin_gridDim_t::operator uint3() const {
    414   return {x, y, z};
    415 }
    416 
    417 #include <__clang_cuda_cmath.h>
    418 #include <__clang_cuda_intrinsics.h>
    419 #include <__clang_cuda_complex_builtins.h>
    420 
    421 // curand_mtgp32_kernel helpfully redeclares blockDim and threadIdx in host
    422 // mode, giving them their "proper" types of dim3 and uint3.  This is
    423 // incompatible with the types we give in __clang_cuda_builtin_vars.h.  As as
    424 // hack, force-include the header (nvcc doesn't include it by default) but
    425 // redefine dim3 and uint3 to our builtin types.  (Thankfully dim3 and uint3 are
    426 // only used here for the redeclarations of blockDim and threadIdx.)
    427 #pragma push_macro("dim3")
    428 #pragma push_macro("uint3")
    429 #define dim3 __cuda_builtin_blockDim_t
    430 #define uint3 __cuda_builtin_threadIdx_t
    431 #include "curand_mtgp32_kernel.h"
    432 #pragma pop_macro("dim3")
    433 #pragma pop_macro("uint3")
    434 #pragma pop_macro("__USE_FAST_MATH__")
    435 #pragma pop_macro("__CUDA_INCLUDE_COMPILER_INTERNAL_HEADERS__")
    436 
    437 // CUDA runtime uses this undocumented function to access kernel launch
    438 // configuration. The declaration is in crt/device_functions.h but that file
    439 // includes a lot of other stuff we don't want. Instead, we'll provide our own
    440 // declaration for it here.
    441 #if CUDA_VERSION >= 9020
    442 extern "C" unsigned __cudaPushCallConfiguration(dim3 gridDim, dim3 blockDim,
    443                                                 size_t sharedMem = 0,
    444                                                 void *stream = 0);
    445 #endif
    446 
    447 #endif // __CUDA__
    448 #endif // __CLANG_CUDA_RUNTIME_WRAPPER_H__
    449