Skip to content

Commit 254fd31

Browse files
committed
[Offload] Use flat array for cuLaunchKernel
We already used a flat array of kernel launch parameters for the AMD GPU launch, but now we also use this scheme for the NVIDIA GPU launch. The only remaining/required use of the indirection is the host plugin (due to ffi). This allows us to simplify the use for non-OpenMP kernel launch.
1 parent bd815a5 commit 254fd31

File tree

6 files changed

+68
-44
lines changed

6 files changed

+68
-44
lines changed

offload/include/Shared/APITypes.h

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -112,6 +112,16 @@ static_assert(sizeof(KernelArgsTy) ==
112112
(8 * sizeof(int32_t) + 3 * sizeof(int64_t) +
113113
4 * sizeof(void **) + 2 * sizeof(int64_t *)),
114114
"Invalid struct size");
115+
116+
/// Flat array of kernel launch parameters and their total size.
117+
struct KernelLaunchParamsTy {
118+
/// Size of the Data array in bytes.
119+
size_t Size = 0;
120+
/// Flat array of kernel parameters.
121+
void *Data = nullptr;
122+
/// Ptrs to the Data entries. Only strictly required for the host plugin.
123+
void **Ptrs = nullptr;
124+
};
115125
}
116126

117127
#endif // OMPTARGET_SHARED_API_TYPES_H

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include <unistd.h>
2121
#include <unordered_map>
2222

23+
#include "Shared/APITypes.h"
2324
#include "Shared/Debug.h"
2425
#include "Shared/Environment.h"
2526
#include "Shared/Utils.h"
@@ -558,7 +559,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
558559

559560
/// Launch the AMDGPU kernel function.
560561
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
561-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
562+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
563+
KernelLaunchParamsTy LaunchParams,
562564
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
563565

564566
/// Print more elaborate kernel launch info for AMDGPU
@@ -2802,9 +2804,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
28022804
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
28032805

28042806
KernelArgsTy KernelArgs = {};
2805-
if (auto Err = AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2806-
/*NumBlocks=*/1ul, KernelArgs,
2807-
/*Args=*/nullptr, AsyncInfoWrapper))
2807+
if (auto Err =
2808+
AMDGPUKernel.launchImpl(*this, /*NumThread=*/1u,
2809+
/*NumBlocks=*/1ul, KernelArgs,
2810+
KernelLaunchParamsTy{}, AsyncInfoWrapper))
28082811
return Err;
28092812

28102813
Error Err = Plugin::success();
@@ -3266,18 +3269,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
32663269

32673270
Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
32683271
uint32_t NumThreads, uint64_t NumBlocks,
3269-
KernelArgsTy &KernelArgs, void *Args,
3272+
KernelArgsTy &KernelArgs,
3273+
KernelLaunchParamsTy LaunchParams,
32703274
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
3271-
const uint32_t KernelArgsSize = KernelArgs.NumArgs * sizeof(void *);
32723275

3273-
if (ArgsSize < KernelArgsSize)
3276+
if (ArgsSize < LaunchParams.Size)
32743277
return Plugin::error("Mismatch of kernel arguments size");
32753278

32763279
// The args size reported by HSA may or may not contain the implicit args.
32773280
// For now, assume that HSA does not consider the implicit arguments when
32783281
// reporting the arguments of a kernel. In the worst case, we can waste
32793282
// 56 bytes per allocation.
3280-
uint32_t AllArgsSize = KernelArgsSize + ImplicitArgsSize;
3283+
uint32_t AllArgsSize = LaunchParams.Size + ImplicitArgsSize;
32813284

32823285
AMDGPUPluginTy &AMDGPUPlugin =
32833286
static_cast<AMDGPUPluginTy &>(GenericDevice.Plugin);
@@ -3302,17 +3305,16 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
33023305
// Initialize implicit arguments.
33033306
utils::AMDGPUImplicitArgsTy *ImplArgs =
33043307
reinterpret_cast<utils::AMDGPUImplicitArgsTy *>(
3305-
advanceVoidPtr(AllArgs, KernelArgsSize));
3308+
advanceVoidPtr(AllArgs, LaunchParams.Size));
33063309

33073310
// Initialize the implicit arguments to zero.
33083311
std::memset(ImplArgs, 0, ImplicitArgsSize);
33093312

33103313
// Copy the explicit arguments.
33113314
// TODO: We should expose the args memory manager alloc to the common part as
33123315
// alternative to copying them twice.
3313-
if (KernelArgs.NumArgs)
3314-
std::memcpy(AllArgs, *static_cast<void **>(Args),
3315-
sizeof(void *) * KernelArgs.NumArgs);
3316+
if (LaunchParams.Size)
3317+
std::memcpy(AllArgs, LaunchParams.Data, LaunchParams.Size);
33163318

33173319
AMDGPUDeviceTy &AMDGPUDevice = static_cast<AMDGPUDeviceTy &>(GenericDevice);
33183320

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
#include <shared_mutex>
2020
#include <vector>
2121

22+
#include "Shared/APITypes.h"
2223
#include "Shared/Debug.h"
2324
#include "Shared/Environment.h"
2425
#include "Shared/EnvironmentVar.h"
@@ -265,7 +266,7 @@ struct GenericKernelTy {
265266
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
266267
virtual Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
267268
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
268-
void *Args,
269+
KernelLaunchParamsTy LaunchParams,
269270
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;
270271

271272
/// Get the kernel name.
@@ -326,11 +327,12 @@ struct GenericKernelTy {
326327

327328
private:
328329
/// Prepare the arguments before launching the kernel.
329-
void *prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
330-
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
331-
llvm::SmallVectorImpl<void *> &Args,
332-
llvm::SmallVectorImpl<void *> &Ptrs,
333-
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
330+
KernelLaunchParamsTy
331+
prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
332+
ptrdiff_t *ArgOffsets, uint32_t &NumArgs,
333+
llvm::SmallVectorImpl<void *> &Args,
334+
llvm::SmallVectorImpl<void *> &Ptrs,
335+
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) const;
334336

335337
/// Get the number of threads and blocks for the kernel based on the
336338
/// user-defined threads and block clauses.

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -268,9 +268,9 @@ struct RecordReplayTy {
268268
OS.close();
269269
}
270270

271-
void saveKernelDescr(const char *Name, void **ArgPtrs, int32_t NumArgs,
272-
uint64_t NumTeamsClause, uint32_t ThreadLimitClause,
273-
uint64_t LoopTripCount) {
271+
void saveKernelDescr(const char *Name, KernelLaunchParamsTy LaunchParams,
272+
int32_t NumArgs, uint64_t NumTeamsClause,
273+
uint32_t ThreadLimitClause, uint64_t LoopTripCount) {
274274
json::Object JsonKernelInfo;
275275
JsonKernelInfo["Name"] = Name;
276276
JsonKernelInfo["NumArgs"] = NumArgs;
@@ -283,7 +283,7 @@ struct RecordReplayTy {
283283

284284
json::Array JsonArgPtrs;
285285
for (int I = 0; I < NumArgs; ++I)
286-
JsonArgPtrs.push_back((intptr_t)ArgPtrs[I]);
286+
JsonArgPtrs.push_back((intptr_t)LaunchParams.Ptrs[I]);
287287
JsonKernelInfo["ArgPtrs"] = json::Value(std::move(JsonArgPtrs));
288288

289289
json::Array JsonArgOffsets;
@@ -549,7 +549,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
549549
if (!KernelLaunchEnvOrErr)
550550
return KernelLaunchEnvOrErr.takeError();
551551

552-
void *KernelArgsPtr =
552+
KernelLaunchParamsTy LaunchParams =
553553
prepareArgs(GenericDevice, ArgPtrs, ArgOffsets, KernelArgs.NumArgs, Args,
554554
Ptrs, *KernelLaunchEnvOrErr);
555555

@@ -564,7 +564,7 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
564564
if (RecordReplay.isRecording()) {
565565
RecordReplay.saveImage(getName(), getImage());
566566
RecordReplay.saveKernelInput(getName(), getImage());
567-
RecordReplay.saveKernelDescr(getName(), Ptrs.data(), KernelArgs.NumArgs,
567+
RecordReplay.saveKernelDescr(getName(), LaunchParams, KernelArgs.NumArgs,
568568
NumBlocks, NumThreads, KernelArgs.Tripcount);
569569
}
570570

@@ -573,10 +573,10 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
573573
return Err;
574574

575575
return launchImpl(GenericDevice, NumThreads, NumBlocks, KernelArgs,
576-
KernelArgsPtr, AsyncInfoWrapper);
576+
LaunchParams, AsyncInfoWrapper);
577577
}
578578

579-
void *GenericKernelTy::prepareArgs(
579+
KernelLaunchParamsTy GenericKernelTy::prepareArgs(
580580
GenericDeviceTy &GenericDevice, void **ArgPtrs, ptrdiff_t *ArgOffsets,
581581
uint32_t &NumArgs, llvm::SmallVectorImpl<void *> &Args,
582582
llvm::SmallVectorImpl<void *> &Ptrs,
@@ -585,22 +585,22 @@ void *GenericKernelTy::prepareArgs(
585585
NumArgs += KLEOffset;
586586

587587
if (NumArgs == 0)
588-
return nullptr;
588+
return KernelLaunchParamsTy{};
589589

590590
Args.resize(NumArgs);
591591
Ptrs.resize(NumArgs);
592592

593593
if (KernelLaunchEnvironment) {
594-
Ptrs[0] = KernelLaunchEnvironment;
595-
Args[0] = &Ptrs[0];
594+
Args[0] = KernelLaunchEnvironment;
595+
Ptrs[0] = &Args[0];
596596
}
597597

598598
for (uint32_t I = KLEOffset; I < NumArgs; ++I) {
599-
Ptrs[I] =
599+
Args[I] =
600600
(void *)((intptr_t)ArgPtrs[I - KLEOffset] + ArgOffsets[I - KLEOffset]);
601-
Args[I] = &Ptrs[I];
601+
Ptrs[I] = &Args[I];
602602
}
603-
return &Args[0];
603+
return KernelLaunchParamsTy{sizeof(void *) * NumArgs, &Args[0], &Ptrs[0]};
604604
}
605605

606606
uint32_t GenericKernelTy::getNumThreads(GenericDeviceTy &GenericDevice,

offload/plugins-nextgen/cuda/src/rtl.cpp

Lines changed: 19 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <string>
1717
#include <unordered_map>
1818

19+
#include "Shared/APITypes.h"
1920
#include "Shared/Debug.h"
2021
#include "Shared/Environment.h"
2122

@@ -149,7 +150,8 @@ struct CUDAKernelTy : public GenericKernelTy {
149150

150151
/// Launch the CUDA kernel function.
151152
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
152-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
153+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
154+
KernelLaunchParamsTy LaunchParams,
153155
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;
154156

155157
private:
@@ -1228,9 +1230,10 @@ struct CUDADeviceTy : public GenericDeviceTy {
12281230
AsyncInfoWrapperTy AsyncInfoWrapper(*this, nullptr);
12291231

12301232
KernelArgsTy KernelArgs = {};
1231-
if (auto Err = CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
1232-
/*NumBlocks=*/1ul, KernelArgs, nullptr,
1233-
AsyncInfoWrapper))
1233+
if (auto Err =
1234+
CUDAKernel.launchImpl(*this, /*NumThread=*/1u,
1235+
/*NumBlocks=*/1ul, KernelArgs,
1236+
KernelLaunchParamsTy{}, AsyncInfoWrapper))
12341237
return Err;
12351238

12361239
Error Err = Plugin::success();
@@ -1274,7 +1277,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
12741277

12751278
Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
12761279
uint32_t NumThreads, uint64_t NumBlocks,
1277-
KernelArgsTy &KernelArgs, void *Args,
1280+
KernelArgsTy &KernelArgs,
1281+
KernelLaunchParamsTy LaunchParams,
12781282
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
12791283
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
12801284

@@ -1285,11 +1289,16 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
12851289
uint32_t MaxDynCGroupMem =
12861290
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
12871291

1288-
CUresult Res =
1289-
cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
1290-
/*gridDimZ=*/1, NumThreads,
1291-
/*blockDimY=*/1, /*blockDimZ=*/1, MaxDynCGroupMem, Stream,
1292-
(void **)Args, nullptr);
1292+
void *Config[] = {/* CU_LAUNCH_PARAM_BUFFER_POINTER */ (void *)0x01,
1293+
LaunchParams.Data,
1294+
/* CU_LAUNCH_PARAM_BUFFER_SIZE */ (void *)0x02,
1295+
reinterpret_cast<void *>(&LaunchParams.Size),
1296+
/* CU_LAUNCH_PARAM_END */ (void *)0x00};
1297+
1298+
CUresult Res = cuLaunchKernel(Func, NumBlocks, /*gridDimY=*/1,
1299+
/*gridDimZ=*/1, NumThreads,
1300+
/*blockDimY=*/1, /*blockDimZ=*/1,
1301+
MaxDynCGroupMem, Stream, nullptr, Config);
12931302
return Plugin::check(Res, "Error in cuLaunchKernel for '%s': %s", getName());
12941303
}
12951304

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,8 @@ struct GenELF64KernelTy : public GenericKernelTy {
9090

9191
/// Launch the kernel using the libffi.
9292
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads,
93-
uint64_t NumBlocks, KernelArgsTy &KernelArgs, void *Args,
93+
uint64_t NumBlocks, KernelArgsTy &KernelArgs,
94+
KernelLaunchParamsTy LaunchParams,
9495
AsyncInfoWrapperTy &AsyncInfoWrapper) const override {
9596
// Create a vector of ffi_types, one per argument.
9697
SmallVector<ffi_type *, 16> ArgTypes(KernelArgs.NumArgs, &ffi_type_pointer);
@@ -105,7 +106,7 @@ struct GenELF64KernelTy : public GenericKernelTy {
105106

106107
// Call the kernel function through libffi.
107108
long Return;
108-
ffi_call(&Cif, Func, &Return, (void **)Args);
109+
ffi_call(&Cif, Func, &Return, (void **)LaunchParams.Ptrs);
109110

110111
return Plugin::success();
111112
}

0 commit comments

Comments
 (0)