Skip to content

Commit 3a56b5a

Browse files
authored
[CUDA] Include PTX in non-RDC mode using the new driver (llvm#84367)
Summary: The old driver embeds PTX in rdc-mode and so does the `nvcc` compiler. The new driver currently does not do this, so we should keep it consistent in this case. This simply requires adding the assembler output as an input to the offloading action that gets fed to fatbin.
1 parent 69b8bc7 commit 3a56b5a

File tree

4 files changed

+36
-22
lines changed

4 files changed

+36
-22
lines changed

Diff for: clang/docs/ReleaseNotes.rst

+3
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,9 @@ RISC-V Support
391391
CUDA/HIP Language Changes
392392
^^^^^^^^^^^^^^^^^^^^^^^^^
393393

394+
- PTX is no longer included by default when compiling for CUDA. Using
395+
``--cuda-include-ptx=all`` will restore the old behavior.
396+
394397
CUDA Support
395398
^^^^^^^^^^^^
396399

Diff for: clang/lib/Driver/Driver.cpp

+8
Original file line numberDiff line numberDiff line change
@@ -4625,7 +4625,15 @@ Action *Driver::BuildOffloadingActions(Compilation &C,
46254625
DDeps.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
46264626
OffloadAction::DeviceDependences DDep;
46274627
DDep.add(*A, *TCAndArch->first, TCAndArch->second.data(), Kind);
4628+
4629+
// Compiling CUDA in non-RDC mode uses the PTX output if available.
4630+
for (Action *Input : A->getInputs())
4631+
if (Kind == Action::OFK_Cuda && A->getType() == types::TY_Object &&
4632+
!Args.hasFlag(options::OPT_fgpu_rdc, options::OPT_fno_gpu_rdc,
4633+
false))
4634+
DDep.add(*Input, *TCAndArch->first, TCAndArch->second.data(), Kind);
46284635
OffloadActions.push_back(C.MakeAction<OffloadAction>(DDep, A->getType()));
4636+
46294637
++TCAndArch;
46304638
}
46314639
}

Diff for: clang/lib/Driver/ToolChains/Cuda.cpp

+12-10
Original file line numberDiff line numberDiff line change
@@ -503,18 +503,20 @@ void NVPTX::Assembler::ConstructJob(Compilation &C, const JobAction &JA,
503503
Exec, CmdArgs, Inputs, Output));
504504
}
505505

506-
static bool shouldIncludePTX(const ArgList &Args, const char *gpu_arch) {
507-
bool includePTX = true;
508-
for (Arg *A : Args) {
509-
if (!(A->getOption().matches(options::OPT_cuda_include_ptx_EQ) ||
510-
A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ)))
511-
continue;
506+
static bool shouldIncludePTX(const ArgList &Args, StringRef InputArch) {
507+
// The new driver does not include PTX by default to avoid overhead.
508+
bool includePTX = !Args.hasFlag(options::OPT_offload_new_driver,
509+
options::OPT_no_offload_new_driver, false);
510+
for (Arg *A : Args.filtered(options::OPT_cuda_include_ptx_EQ,
511+
options::OPT_no_cuda_include_ptx_EQ)) {
512512
A->claim();
513513
const StringRef ArchStr = A->getValue();
514-
if (ArchStr == "all" || ArchStr == gpu_arch) {
515-
includePTX = A->getOption().matches(options::OPT_cuda_include_ptx_EQ);
516-
continue;
517-
}
514+
if (A->getOption().matches(options::OPT_cuda_include_ptx_EQ) &&
515+
(ArchStr == "all" || ArchStr == InputArch))
516+
includePTX = true;
517+
else if (A->getOption().matches(options::OPT_no_cuda_include_ptx_EQ) &&
518+
(ArchStr == "all" || ArchStr == InputArch))
519+
includePTX = false;
518520
}
519521
return includePTX;
520522
}

Diff for: clang/test/Driver/cuda-phases.cu

+13-12
Original file line numberDiff line numberDiff line change
@@ -244,31 +244,32 @@
244244
// NEW-DRIVER-RDC-NEXT: 18: assembler, {17}, object, (host-cuda)
245245
// NEW-DRIVER-RDC-NEXT: 19: clang-linker-wrapper, {18}, image, (host-cuda)
246246

247-
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver -fgpu-rdc \
247+
// RUN: %clang -### -target powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \
248248
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s 2>&1 | FileCheck --check-prefix=NEW-DRIVER %s
249-
// NEW-DRIVER: 0: input, "[[INPUT:.+]]", cuda
250-
// NEW-DRIVER-NEXT: 1: preprocessor, {0}, cuda-cpp-output
251-
// NEW-DRIVER-NEXT: 2: compiler, {1}, ir
252-
// NEW-DRIVER-NEXT: 3: input, "[[INPUT]]", cuda, (device-cuda, sm_52)
249+
// NEW-DRIVER: 0: input, "[[CUDA:.+]]", cuda, (host-cuda)
250+
// NEW-DRIVER-NEXT: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
251+
// NEW-DRIVER-NEXT: 2: compiler, {1}, ir, (host-cuda)
252+
// NEW-DRIVER-NEXT: 3: input, "[[CUDA]]", cuda, (device-cuda, sm_52)
253253
// NEW-DRIVER-NEXT: 4: preprocessor, {3}, cuda-cpp-output, (device-cuda, sm_52)
254254
// NEW-DRIVER-NEXT: 5: compiler, {4}, ir, (device-cuda, sm_52)
255255
// NEW-DRIVER-NEXT: 6: backend, {5}, assembler, (device-cuda, sm_52)
256256
// NEW-DRIVER-NEXT: 7: assembler, {6}, object, (device-cuda, sm_52)
257-
// NEW-DRIVER-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
258-
// NEW-DRIVER-NEXT: 9: input, "[[INPUT]]", cuda, (device-cuda, sm_70)
257+
// NEW-DRIVER-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, object
258+
// NEW-DRIVER-NEXT: 9: input, "[[CUDA]]", cuda, (device-cuda, sm_70)
259259
// NEW-DRIVER-NEXT: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
260260
// NEW-DRIVER-NEXT: 11: compiler, {10}, ir, (device-cuda, sm_70)
261261
// NEW-DRIVER-NEXT: 12: backend, {11}, assembler, (device-cuda, sm_70)
262262
// NEW-DRIVER-NEXT: 13: assembler, {12}, object, (device-cuda, sm_70)
263-
// NEW-DRIVER-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
264-
// NEW-DRIVER-NEXT: 15: clang-offload-packager, {8, 14}, image
265-
// NEW-DRIVER-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (powerpc64le-ibm-linux-gnu)" {15}, ir
263+
// NEW-DRIVER-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {12}, object
264+
// NEW-DRIVER-NEXT: 15: linker, {8, 14}, cuda-fatbin, (device-cuda)
265+
// NEW-DRIVER-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {15}, ir
266266
// NEW-DRIVER-NEXT: 17: backend, {16}, assembler, (host-cuda)
267267
// NEW-DRIVER-NEXT: 18: assembler, {17}, object, (host-cuda)
268268
// NEW-DRIVER-NEXT: 19: clang-linker-wrapper, {18}, image, (host-cuda)
269269

270270
// RUN: %clang -### --target=powerpc64le-ibm-linux-gnu -ccc-print-phases --offload-new-driver \
271271
// RUN: --offload-arch=sm_52 --offload-arch=sm_70 %s %S/Inputs/empty.cpp 2>&1 | FileCheck --check-prefix=NON-CUDA-INPUT %s
272+
272273
// NON-CUDA-INPUT: 0: input, "[[CUDA:.+]]", cuda, (host-cuda)
273274
// NON-CUDA-INPUT-NEXT: 1: preprocessor, {0}, cuda-cpp-output, (host-cuda)
274275
// NON-CUDA-INPUT-NEXT: 2: compiler, {1}, ir, (host-cuda)
@@ -277,13 +278,13 @@
277278
// NON-CUDA-INPUT-NEXT: 5: compiler, {4}, ir, (device-cuda, sm_52)
278279
// NON-CUDA-INPUT-NEXT: 6: backend, {5}, assembler, (device-cuda, sm_52)
279280
// NON-CUDA-INPUT-NEXT: 7: assembler, {6}, object, (device-cuda, sm_52)
280-
// NON-CUDA-INPUT-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, object
281+
// NON-CUDA-INPUT-NEXT: 8: offload, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {7}, "device-cuda (nvptx64-nvidia-cuda:sm_52)" {6}, object
281282
// NON-CUDA-INPUT-NEXT: 9: input, "[[CUDA]]", cuda, (device-cuda, sm_70)
282283
// NON-CUDA-INPUT-NEXT: 10: preprocessor, {9}, cuda-cpp-output, (device-cuda, sm_70)
283284
// NON-CUDA-INPUT-NEXT: 11: compiler, {10}, ir, (device-cuda, sm_70)
284285
// NON-CUDA-INPUT-NEXT: 12: backend, {11}, assembler, (device-cuda, sm_70)
285286
// NON-CUDA-INPUT-NEXT: 13: assembler, {12}, object, (device-cuda, sm_70)
286-
// NON-CUDA-INPUT-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, object
287+
// NON-CUDA-INPUT-NEXT: 14: offload, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {13}, "device-cuda (nvptx64-nvidia-cuda:sm_70)" {12}, object
287288
// NON-CUDA-INPUT-NEXT: 15: linker, {8, 14}, cuda-fatbin, (device-cuda)
288289
// NON-CUDA-INPUT-NEXT: 16: offload, "host-cuda (powerpc64le-ibm-linux-gnu)" {2}, "device-cuda (nvptx64-nvidia-cuda)" {15}, ir
289290
// NON-CUDA-INPUT-NEXT: 17: backend, {16}, assembler, (host-cuda)

0 commit comments

Comments
 (0)