Skip to content

Commit 80525df

Browse files
authored
[Offload][CUDA] Allow CUDA kernels to use LLVM/Offload (#94549)
Through the new `-foffload-via-llvm` flag, CUDA kernels can now be lowered to the LLVM/Offload API. On the Clang side, this is simply done by using the OpenMP offload toolchain and emitting calls to `llvm*` functions to orchestrate the kernel launch rather than `cuda*` functions. These `llvm*` functions are implemented on top of the existing LLVM/Offload API. As we are about to redefine the Offload API, this will help us in the design process as a second offload language. We do not support any CUDA APIs yet, however, we could: https://www.osti.gov/servlets/purl/1892137 For proper host execution we need to resurrect/rebase https://tianshilei.me/wp-content/uploads/2021/12/llpp-2021.pdf (which was designed for debugging). ``` ❯❯❯ cat test.cu extern "C" { void *llvm_omp_target_alloc_shared(size_t Size, int DeviceNum); void llvm_omp_target_free_shared(void *DevicePtr, int DeviceNum); } __global__ void square(int *A) { *A = 42; } int main(int argc, char **argv) { int DevNo = 0; int *Ptr = reinterpret_cast<int *>(llvm_omp_target_alloc_shared(4, DevNo)); *Ptr = 7; printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); square<<<1, 1>>>(Ptr); printf("Ptr %p, *Ptr %i\n", Ptr, *Ptr); llvm_omp_target_free_shared(Ptr, DevNo); } ❯❯❯ clang++ test.cu -O3 -o test123 -foffload-via-llvm --offload-arch=native ❯❯❯ llvm-objdump --offloading test123 test123: file format elf64-x86-64 OFFLOADING IMAGE [0]: kind elf arch gfx90a triple amdgcn-amd-amdhsa producer openmp ❯❯❯ LIBOMPTARGET_INFO=16 ./test123 Ptr 0x155448ac8000, *Ptr 7 Ptr 0x155448ac8000, *Ptr 42 ```
1 parent 887f700 commit 80525df

29 files changed

+577
-54
lines changed

clang/include/clang/Basic/LangOptions.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,7 @@ LANGOPT(GPUMaxThreadsPerBlock, 32, 1024, "default max threads per block for kern
298298
LANGOPT(GPUDeferDiag, 1, 0, "defer host/device related diagnostic messages for CUDA/HIP")
299299
LANGOPT(GPUExcludeWrongSideOverloads, 1, 0, "always exclude wrong side overloads in overloading resolution for CUDA/HIP")
300300
LANGOPT(OffloadingNewDriver, 1, 0, "use the new driver for generating offloading code.")
301+
LANGOPT(OffloadViaLLVM, 1, 0, "target LLVM/Offload as portable offloading runtime.")
301302

302303
LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device")
303304
LANGOPT(SYCLIsHost , 1, 0, "SYCL host compilation")

clang/include/clang/Driver/Options.td

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1302,6 +1302,12 @@ def no_offload_compress : Flag<["--"], "no-offload-compress">;
13021302
def offload_compression_level_EQ : Joined<["--"], "offload-compression-level=">,
13031303
Flags<[HelpHidden]>,
13041304
HelpText<"Compression level for offload device binaries (HIP only)">;
1305+
1306+
defm offload_via_llvm : BoolFOption<"offload-via-llvm",
1307+
LangOpts<"OffloadViaLLVM">, DefaultFalse,
1308+
PosFlag<SetTrue, [], [ClangOption, CC1Option], "Use">,
1309+
NegFlag<SetFalse, [], [ClangOption], "Don't use">,
1310+
BothFlags<[], [ClangOption], " LLVM/Offload as portable offloading runtime.">>;
13051311
}
13061312

13071313
// CUDA options

clang/lib/CodeGen/CGCUDANV.cpp

Lines changed: 82 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -15,10 +15,12 @@
1515
#include "CGCXXABI.h"
1616
#include "CodeGenFunction.h"
1717
#include "CodeGenModule.h"
18+
#include "clang/AST/CharUnits.h"
1819
#include "clang/AST/Decl.h"
1920
#include "clang/Basic/Cuda.h"
2021
#include "clang/CodeGen/CodeGenABITypes.h"
2122
#include "clang/CodeGen/ConstantInitBuilder.h"
23+
#include "llvm/ADT/StringRef.h"
2224
#include "llvm/Frontend/Offloading/Utility.h"
2325
#include "llvm/IR/BasicBlock.h"
2426
#include "llvm/IR/Constants.h"
@@ -36,6 +38,11 @@ constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF"
3638

3739
class CGNVCUDARuntime : public CGCUDARuntime {
3840

41+
/// The prefix used for function calls and section names (CUDA, HIP, LLVM)
42+
StringRef Prefix;
43+
/// TODO: We should transition the OpenMP section to LLVM/Offload
44+
StringRef SectionPrefix;
45+
3946
private:
4047
llvm::IntegerType *IntTy, *SizeTy;
4148
llvm::Type *VoidTy;
@@ -132,6 +139,9 @@ class CGNVCUDARuntime : public CGCUDARuntime {
132139
return DummyFunc;
133140
}
134141

142+
Address prepareKernelArgs(CodeGenFunction &CGF, FunctionArgList &Args);
143+
Address prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
144+
FunctionArgList &Args);
135145
void emitDeviceStubBodyLegacy(CodeGenFunction &CGF, FunctionArgList &Args);
136146
void emitDeviceStubBodyNew(CodeGenFunction &CGF, FunctionArgList &Args);
137147
std::string getDeviceSideName(const NamedDecl *ND) override;
@@ -191,15 +201,11 @@ class CGNVCUDARuntime : public CGCUDARuntime {
191201
} // end anonymous namespace
192202

193203
std::string CGNVCUDARuntime::addPrefixToName(StringRef FuncName) const {
194-
if (CGM.getLangOpts().HIP)
195-
return ((Twine("hip") + Twine(FuncName)).str());
196-
return ((Twine("cuda") + Twine(FuncName)).str());
204+
return (Prefix + FuncName).str();
197205
}
198206
std::string
199207
CGNVCUDARuntime::addUnderscoredPrefixToName(StringRef FuncName) const {
200-
if (CGM.getLangOpts().HIP)
201-
return ((Twine("__hip") + Twine(FuncName)).str());
202-
return ((Twine("__cuda") + Twine(FuncName)).str());
208+
return ("__" + Prefix + FuncName).str();
203209
}
204210

205211
static std::unique_ptr<MangleContext> InitDeviceMC(CodeGenModule &CGM) {
@@ -227,6 +233,14 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
227233
SizeTy = CGM.SizeTy;
228234
VoidTy = CGM.VoidTy;
229235
PtrTy = CGM.UnqualPtrTy;
236+
237+
if (CGM.getLangOpts().OffloadViaLLVM) {
238+
Prefix = "llvm";
239+
SectionPrefix = "omp";
240+
} else if (CGM.getLangOpts().HIP)
241+
SectionPrefix = Prefix = "hip";
242+
else
243+
SectionPrefix = Prefix = "cuda";
230244
}
231245

232246
llvm::FunctionCallee CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -305,18 +319,58 @@ void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
305319
}
306320
if (CudaFeatureEnabled(CGM.getTarget().getSDKVersion(),
307321
CudaFeature::CUDA_USES_NEW_LAUNCH) ||
308-
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI))
322+
(CGF.getLangOpts().HIP && CGF.getLangOpts().HIPUseNewLaunchAPI) ||
323+
(CGF.getLangOpts().OffloadViaLLVM))
309324
emitDeviceStubBodyNew(CGF, Args);
310325
else
311326
emitDeviceStubBodyLegacy(CGF, Args);
312327
}
313328

314-
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
315-
// array and kernels are launched using cudaLaunchKernel().
316-
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
317-
FunctionArgList &Args) {
318-
// Build the shadow stack entry at the very start of the function.
329+
/// CUDA passes the arguments with a level of indirection. For example, a
330+
/// (void*, short, void*) is passed as {void **, short *, void **} to the launch
331+
/// function. For the LLVM/offload launch we flatten the arguments into the
332+
/// struct directly. In addition, we include the size of the arguments, thus
333+
/// pass {sizeof({void *, short, void *}), ptr to {void *, short, void *},
334+
/// nullptr}. The last nullptr needs to be initialized to an array of pointers
335+
/// pointing to the arguments if we want to offload to the host.
336+
Address CGNVCUDARuntime::prepareKernelArgsLLVMOffload(CodeGenFunction &CGF,
337+
FunctionArgList &Args) {
338+
SmallVector<llvm::Type *> ArgTypes, KernelLaunchParamsTypes;
339+
for (auto &Arg : Args)
340+
ArgTypes.push_back(CGF.ConvertTypeForMem(Arg->getType()));
341+
llvm::StructType *KernelArgsTy = llvm::StructType::create(ArgTypes);
342+
343+
auto *Int64Ty = CGF.Builder.getInt64Ty();
344+
KernelLaunchParamsTypes.push_back(Int64Ty);
345+
KernelLaunchParamsTypes.push_back(PtrTy);
346+
KernelLaunchParamsTypes.push_back(PtrTy);
347+
348+
llvm::StructType *KernelLaunchParamsTy =
349+
llvm::StructType::create(KernelLaunchParamsTypes);
350+
Address KernelArgs = CGF.CreateTempAllocaWithoutCast(
351+
KernelArgsTy, CharUnits::fromQuantity(16), "kernel_args");
352+
Address KernelLaunchParams = CGF.CreateTempAllocaWithoutCast(
353+
KernelLaunchParamsTy, CharUnits::fromQuantity(16),
354+
"kernel_launch_params");
355+
356+
auto KernelArgsSize = CGM.getDataLayout().getTypeAllocSize(KernelArgsTy);
357+
CGF.Builder.CreateStore(llvm::ConstantInt::get(Int64Ty, KernelArgsSize),
358+
CGF.Builder.CreateStructGEP(KernelLaunchParams, 0));
359+
CGF.Builder.CreateStore(KernelArgs.emitRawPointer(CGF),
360+
CGF.Builder.CreateStructGEP(KernelLaunchParams, 1));
361+
CGF.Builder.CreateStore(llvm::Constant::getNullValue(PtrTy),
362+
CGF.Builder.CreateStructGEP(KernelLaunchParams, 2));
363+
364+
for (unsigned i = 0; i < Args.size(); ++i) {
365+
auto *ArgVal = CGF.Builder.CreateLoad(CGF.GetAddrOfLocalVar(Args[i]));
366+
CGF.Builder.CreateStore(ArgVal, CGF.Builder.CreateStructGEP(KernelArgs, i));
367+
}
319368

369+
return KernelLaunchParams;
370+
}
371+
372+
Address CGNVCUDARuntime::prepareKernelArgs(CodeGenFunction &CGF,
373+
FunctionArgList &Args) {
320374
// Calculate amount of space we will need for all arguments. If we have no
321375
// args, allocate a single pointer so we still have a valid pointer to the
322376
// argument array that we can pass to runtime, even if it will be unused.
@@ -331,6 +385,17 @@ void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
331385
VoidVarPtr, CGF.Builder.CreateConstGEP1_32(
332386
PtrTy, KernelArgs.emitRawPointer(CGF), i));
333387
}
388+
return KernelArgs;
389+
}
390+
391+
// CUDA 9.0+ uses new way to launch kernels. Parameters are packed in a local
392+
// array and kernels are launched using cudaLaunchKernel().
393+
void CGNVCUDARuntime::emitDeviceStubBodyNew(CodeGenFunction &CGF,
394+
FunctionArgList &Args) {
395+
// Build the shadow stack entry at the very start of the function.
396+
Address KernelArgs = CGF.getLangOpts().OffloadViaLLVM
397+
? prepareKernelArgsLLVMOffload(CGF, Args)
398+
: prepareKernelArgs(CGF, Args);
334399

335400
llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
336401

@@ -1129,8 +1194,9 @@ void CGNVCUDARuntime::transformManagedVars() {
11291194
// registered. The linker will provide a pointer to this section so we can
11301195
// register the symbols with the linked device image.
11311196
void CGNVCUDARuntime::createOffloadingEntries() {
1132-
StringRef Section = CGM.getLangOpts().HIP ? "hip_offloading_entries"
1133-
: "cuda_offloading_entries";
1197+
SmallVector<char, 32> Out;
1198+
StringRef Section = (SectionPrefix + "_offloading_entries").toStringRef(Out);
1199+
11341200
llvm::Module &M = CGM.getModule();
11351201
for (KernelInfo &I : EmittedKernels)
11361202
llvm::offloading::emitOffloadingEntry(
@@ -1199,7 +1265,8 @@ llvm::Function *CGNVCUDARuntime::finalizeModule() {
11991265
}
12001266
return nullptr;
12011267
}
1202-
if (CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode)
1268+
if (CGM.getLangOpts().OffloadViaLLVM ||
1269+
(CGM.getLangOpts().OffloadingNewDriver && RelocatableDeviceCode))
12031270
createOffloadingEntries();
12041271
else
12051272
return makeModuleCtorFunction();

clang/lib/CodeGen/CodeGenFunction.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1013,7 +1013,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy,
10131013
}
10141014

10151015
if (FD && (getLangOpts().OpenCL ||
1016-
(getLangOpts().HIP && getLangOpts().CUDAIsDevice))) {
1016+
((getLangOpts().HIP || getLangOpts().OffloadViaLLVM) &&
1017+
getLangOpts().CUDAIsDevice))) {
10171018
// Add metadata for a kernel function.
10181019
EmitKernelMetadata(FD, Fn);
10191020
}

clang/lib/Driver/Driver.cpp

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -786,11 +786,13 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
786786
}) ||
787787
C.getInputArgs().hasArg(options::OPT_hip_link) ||
788788
C.getInputArgs().hasArg(options::OPT_hipstdpar);
789+
bool UseLLVMOffload = C.getInputArgs().hasArg(
790+
options::OPT_foffload_via_llvm, options::OPT_fno_offload_via_llvm, false);
789791
if (IsCuda && IsHIP) {
790792
Diag(clang::diag::err_drv_mix_cuda_hip);
791793
return;
792794
}
793-
if (IsCuda) {
795+
if (IsCuda && !UseLLVMOffload) {
794796
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
795797
const llvm::Triple &HostTriple = HostTC->getTriple();
796798
auto OFK = Action::OFK_Cuda;
@@ -812,7 +814,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
812814
CudaInstallation.WarnIfUnsupportedVersion();
813815
}
814816
C.addOffloadDeviceToolChain(CudaTC.get(), OFK);
815-
} else if (IsHIP) {
817+
} else if (IsHIP && !UseLLVMOffload) {
816818
if (auto *OMPTargetArg =
817819
C.getInputArgs().getLastArg(options::OPT_fopenmp_targets_EQ)) {
818820
Diag(clang::diag::err_drv_unsupported_opt_for_language_mode)
@@ -836,10 +838,11 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
836838
// We need to generate an OpenMP toolchain if the user specified targets with
837839
// the -fopenmp-targets option or used --offload-arch with OpenMP enabled.
838840
bool IsOpenMPOffloading =
839-
C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
840-
options::OPT_fno_openmp, false) &&
841-
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
842-
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ));
841+
((IsCuda || IsHIP) && UseLLVMOffload) ||
842+
(C.getInputArgs().hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
843+
options::OPT_fno_openmp, false) &&
844+
(C.getInputArgs().hasArg(options::OPT_fopenmp_targets_EQ) ||
845+
C.getInputArgs().hasArg(options::OPT_offload_arch_EQ)));
843846
if (IsOpenMPOffloading) {
844847
// We expect that -fopenmp-targets is always used in conjunction with the
845848
// option -fopenmp specifying a valid runtime with offloading support, i.e.
@@ -867,7 +870,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
867870
for (StringRef T : OpenMPTargets->getValues())
868871
OpenMPTriples.insert(T);
869872
} else if (C.getInputArgs().hasArg(options::OPT_offload_arch_EQ) &&
870-
!IsHIP && !IsCuda) {
873+
((!IsHIP && !IsCuda) || UseLLVMOffload)) {
871874
const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
872875
auto AMDTriple = getHIPOffloadTargetTriple(*this, C.getInputArgs());
873876
auto NVPTXTriple = getNVIDIAOffloadTargetTriple(*this, C.getInputArgs(),
@@ -4152,6 +4155,8 @@ void Driver::BuildActions(Compilation &C, DerivedArgList &Args,
41524155

41534156
bool UseNewOffloadingDriver =
41544157
C.isOffloadingHostKind(Action::OFK_OpenMP) ||
4158+
Args.hasFlag(options::OPT_foffload_via_llvm,
4159+
options::OPT_fno_offload_via_llvm, false) ||
41554160
Args.hasFlag(options::OPT_offload_new_driver,
41564161
options::OPT_no_offload_new_driver, false);
41574162

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 27 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1095,6 +1095,18 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
10951095
CmdArgs.push_back("__clang_openmp_device_functions.h");
10961096
}
10971097

1098+
if (Args.hasArg(options::OPT_foffload_via_llvm)) {
1099+
// Add llvm_wrappers/* to our system include path. This lets us wrap
1100+
// standard library headers and other headers.
1101+
SmallString<128> P(D.ResourceDir);
1102+
llvm::sys::path::append(P, "include", "llvm_offload_wrappers");
1103+
CmdArgs.append({"-internal-isystem", Args.MakeArgString(P), "-include"});
1104+
if (JA.isDeviceOffloading(Action::OFK_OpenMP))
1105+
CmdArgs.push_back("__llvm_offload_device.h");
1106+
else
1107+
CmdArgs.push_back("__llvm_offload_host.h");
1108+
}
1109+
10981110
// Add -i* options, and automatically translate to
10991111
// -include-pch/-include-pth for transparent PCH support. It's
11001112
// wonky, but we include looking for .gch so we can support seamless
@@ -6665,6 +6677,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
66656677
// device offloading action other than OpenMP.
66666678
if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
66676679
options::OPT_fno_openmp, false) &&
6680+
!Args.hasFlag(options::OPT_foffload_via_llvm,
6681+
options::OPT_fno_offload_via_llvm, false) &&
66686682
(JA.isDeviceOffloading(Action::OFK_None) ||
66696683
JA.isDeviceOffloading(Action::OFK_OpenMP))) {
66706684
switch (D.getOpenMPRuntime(Args)) {
@@ -6742,11 +6756,16 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
67426756
Args.addOptOutFlag(CmdArgs, options::OPT_fopenmp_extensions,
67436757
options::OPT_fno_openmp_extensions);
67446758
}
6745-
6746-
// Forward the new driver to change offloading code generation.
6747-
if (Args.hasFlag(options::OPT_offload_new_driver,
6748-
options::OPT_no_offload_new_driver, false))
6759+
// Forward the offload runtime change to code generation, liboffload implies
6760+
// new driver. Otherwise, check if we should forward the new driver to change
6761+
// offloading code generation.
6762+
if (Args.hasFlag(options::OPT_foffload_via_llvm,
6763+
options::OPT_fno_offload_via_llvm, false)) {
6764+
CmdArgs.append({"--offload-new-driver", "-foffload-via-llvm"});
6765+
} else if (Args.hasFlag(options::OPT_offload_new_driver,
6766+
options::OPT_no_offload_new_driver, false)) {
67496767
CmdArgs.push_back("--offload-new-driver");
6768+
}
67506769

67516770
SanitizeArgs.addArgs(TC, Args, CmdArgs, InputType);
67526771

@@ -7778,6 +7797,10 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
77787797
// so that only the relevant declarations are emitted.
77797798
if (IsOpenMPDevice) {
77807799
CmdArgs.push_back("-fopenmp-is-target-device");
7800+
// If we are offloading cuda/hip via llvm, it's also "cuda device code".
7801+
if (Args.hasArg(options::OPT_foffload_via_llvm))
7802+
CmdArgs.push_back("-fcuda-is-device");
7803+
77817804
if (OpenMPDeviceInput) {
77827805
CmdArgs.push_back("-fopenmp-host-ir-file-path");
77837806
CmdArgs.push_back(Args.MakeArgString(OpenMPDeviceInput->getFilename()));

clang/lib/Driver/ToolChains/CommonArgs.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1199,8 +1199,13 @@ bool tools::addOpenMPRuntime(const Compilation &C, ArgStringList &CmdArgs,
11991199
bool ForceStaticHostRuntime, bool IsOffloadingHost,
12001200
bool GompNeedsRT) {
12011201
if (!Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ,
1202-
options::OPT_fno_openmp, false))
1202+
options::OPT_fno_openmp, false)) {
1203+
// We need libomptarget (liboffload) if it's the chosen offloading runtime.
1204+
if (Args.hasFlag(options::OPT_foffload_via_llvm,
1205+
options::OPT_fno_offload_via_llvm, false))
1206+
CmdArgs.push_back("-lomptarget");
12031207
return false;
1208+
}
12041209

12051210
Driver::OpenMPRuntimeKind RTKind = TC.getDriver().getOpenMPRuntime(Args);
12061211

clang/lib/Driver/ToolChains/Cuda.cpp

Lines changed: 16 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -839,17 +839,15 @@ void CudaToolChain::addClangTargetOptions(
839839
DeviceOffloadingKind == Action::OFK_Cuda) &&
840840
"Only OpenMP or CUDA offloading kinds are supported for NVIDIA GPUs.");
841841

842-
if (DeviceOffloadingKind == Action::OFK_Cuda) {
843-
CC1Args.append(
844-
{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
845-
846-
// Unsized function arguments used for variadics were introduced in CUDA-9.0
847-
// We still do not support generating code that actually uses variadic
848-
// arguments yet, but we do need to allow parsing them as recent CUDA
849-
// headers rely on that. https://github.com/llvm/llvm-project/issues/58410
850-
if (CudaInstallation.version() >= CudaVersion::CUDA_90)
851-
CC1Args.push_back("-fcuda-allow-variadic-functions");
852-
}
842+
CC1Args.append(
843+
{"-fcuda-is-device", "-mllvm", "-enable-memcpyopt-without-libcalls"});
844+
845+
// Unsized function arguments used for variadics were introduced in CUDA-9.0
846+
// We still do not support generating code that actually uses variadic
847+
// arguments yet, but we do need to allow parsing them as recent CUDA
848+
// headers rely on that. https://github.com/llvm/llvm-project/issues/58410
849+
if (CudaInstallation.version() >= CudaVersion::CUDA_90)
850+
CC1Args.push_back("-fcuda-allow-variadic-functions");
853851

854852
if (DriverArgs.hasArg(options::OPT_nogpulib))
855853
return;
@@ -867,6 +865,13 @@ void CudaToolChain::addClangTargetOptions(
867865
CC1Args.push_back("-mlink-builtin-bitcode");
868866
CC1Args.push_back(DriverArgs.MakeArgString(LibDeviceFile));
869867

868+
// For now, we don't use any Offload/OpenMP device runtime when we offload
869+
// CUDA via LLVM/Offload. We should split the Offload/OpenMP device runtime
870+
// and include the "generic" (or CUDA-specific) parts.
871+
if (DriverArgs.hasFlag(options::OPT_foffload_via_llvm,
872+
options::OPT_fno_offload_via_llvm, false))
873+
return;
874+
870875
clang::CudaVersion CudaInstallationVersion = CudaInstallation.version();
871876

872877
if (DriverArgs.hasFlag(options::OPT_fcuda_short_ptr,

0 commit comments

Comments
 (0)