intel
diff --git a/‎.github/CODEOWNERS
+3 b/‎.github/CODEOWNERS
+3
diff --git a/‎.github/workflows/sycl-nightly.yml
+2-2 b/‎.github/workflows/sycl-nightly.yml
+2-2
diff --git a/‎.github/workflows/sycl-rel-nightly.yml
+12-10 b/‎.github/workflows/sycl-rel-nightly.yml
+12-10
diff --git a/‎.github/workflows/sycl-windows-run-tests.yml
+8-9 b/‎.github/workflows/sycl-windows-run-tests.yml
+8-9
diff --git a/‎clang/include/clang/Basic/LangOptions.def
+4 b/‎clang/include/clang/Basic/LangOptions.def
+4
diff --git a/‎clang/include/clang/Driver/Options.td
+7 b/‎clang/include/clang/Driver/Options.td
+7
diff --git a/‎clang/include/clang/Sema/SemaBase.h
+1 b/‎clang/include/clang/Sema/SemaBase.h
+1
diff --git a/‎clang/include/clang/Sema/SemaCUDA.h
+3 b/‎clang/include/clang/Sema/SemaCUDA.h
+3
diff --git a/‎clang/lib/Basic/LangOptions.cpp
+1-1 b/‎clang/lib/Basic/LangOptions.cpp
+1-1
diff --git a/‎clang/lib/Basic/Targets/NVPTX.cpp
+5-3 b/‎clang/lib/Basic/Targets/NVPTX.cpp
+5-3
diff --git a/‎clang/lib/CodeGen/CodeGenFunction.cpp
+16-11 b/‎clang/lib/CodeGen/CodeGenFunction.cpp
+16-11
diff --git a/‎clang/lib/Driver/ToolChains/Clang.cpp
+32-1 b/‎clang/lib/Driver/ToolChains/Clang.cpp
+32-1
diff --git a/‎clang/lib/Frontend/CompilerInvocation.cpp
+3 b/‎clang/lib/Frontend/CompilerInvocation.cpp
+3
diff --git a/‎clang/lib/Frontend/InitPreprocessor.cpp
+7-2 b/‎clang/lib/Frontend/InitPreprocessor.cpp
+7-2
diff --git a/‎clang/lib/Sema/Sema.cpp
+11-1 b/‎clang/lib/Sema/Sema.cpp
+11-1
@@ -128,6 +128,9 @@ devops/ @intel/dpcpp-devops-reviewers
 # dev-igc driver update
 devops/dependencies-igc-dev.json @intel/sycl-matrix-reviewers @intel/dpcpp-esimd-reviewers @intel/dpcpp-devops-reviewers
 
+# Benchmarking scripts
+devops/scripts/benchmarks/ @intel/llvm-reviewers-benchmarking
+
 # Kernel fusion JIT compiler
 sycl-jit/ @intel/dpcpp-kernel-fusion-reviewers
 sycl/doc/design/KernelFusionJIT.md @intel/dpcpp-kernel-fusion-reviewers
 
@@ -257,7 +257,7 @@ jobs:
       runner: '["Windows", "build-e2e"]'
       cts_testing_mode: 'build-only'
       tests_selector: cts
-      ref: ${{ github.sha }}
+      repo_ref: ${{ github.sha }}
       sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }}
       sycl_cts_artifact: sycl_cts_bin_win
 
@@ -278,7 +278,7 @@ jobs:
       cts_testing_mode: 'run-only'
       target_devices: ${{ matrix.target_devices }}
       tests_selector: cts
-      ref: ${{ github.sha }}
+      repo_ref: ${{ github.sha }}
       sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }}
       sycl_cts_artifact: sycl_cts_bin_win
 
 
@@ -20,7 +20,7 @@ jobs:
     steps:
     - uses: actions/checkout@v4
       with:
-        ref: sycl-rel-6_0_0
+        ref: sycl-rel-6_1_0
     - run: git show --quiet | tee -a $GITHUB_STEP_SUMMARY
 
     - id: is_new_commit
@@ -40,7 +40,7 @@ jobs:
       build_artifact_suffix: default
       build_configure_extra_args: '--hip --cuda'
       build_image: ghcr.io/intel/llvm/ubuntu2204_build:latest
-      build_ref: sycl-rel-6_0_0
+      build_ref: sycl-rel-6_1_0
 
       # We upload the build for people to download/use, override its name and
       # prefer widespread gzip compression.
@@ -89,7 +89,7 @@ jobs:
       tests_selector: ${{ matrix.tests_selector }}
       extra_lit_opts: ${{ matrix.extra_lit_opts }}
       reset_intel_gpu: ${{ matrix.reset_intel_gpu }}
-      repo_ref: sycl-rel-6_0_0
+      repo_ref: sycl-rel-6_1_0
       devops_ref: sycl
       sycl_toolchain_artifact: sycl_linux_default
       sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
@@ -100,7 +100,7 @@ jobs:
     if: ${{ github.repository == 'intel/llvm' && needs.check_for_new_commits.outputs.is_new_commit != 'false' }}
     uses: ./.github/workflows/sycl-windows-build.yml
     with:
-      ref: sycl-rel-6_0_0
+      ref: sycl-rel-6_1_0
 
       # We upload both Linux/Windows build via Github's "Releases"
       # functionality, make sure Linux/Windows names follow the same pattern.
@@ -119,7 +119,7 @@ jobs:
       runner: '["Windows","gen12"]'
       sycl_toolchain_archive: ${{ needs.build-win.outputs.artifact_archive_name }}
       extra_lit_opts: --param gpu-intel-gen12=True
-      ref: sycl-rel-6_0_0
+      repo_ref: sycl-rel-6_1_0
       devops_ref: sycl
 
   cuda-aws-start:
@@ -129,7 +129,7 @@ jobs:
     secrets: inherit
     with:
       mode: start
-      ref: sycl-rel-6_0_0
+      ref: sycl-rel-6_1_0
 
   cuda-run-tests:
     needs: [ubuntu2204_build, cuda-aws-start]
@@ -141,7 +141,7 @@ jobs:
       image: ghcr.io/intel/llvm/ubuntu2204_build:latest
       image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
       target_devices: cuda:gpu
-      repo_ref: sycl-rel-6_0_0
+      repo_ref: sycl-rel-6_1_0
       devops_ref: sycl
 
       sycl_toolchain_artifact: sycl_linux_default
@@ -155,7 +155,7 @@ jobs:
     secrets: inherit
     with:
       mode: stop
-      ref: sycl-rel-6_0_0
+      ref: sycl-rel-6_1_0
 
   build-sycl-cts:
     needs: ubuntu2204_build
@@ -167,11 +167,13 @@ jobs:
       cts_testing_mode: 'build-only'
       image_options: -u 1001 --device=/dev/dri --privileged --cap-add SYS_ADMIN
       tests_selector: cts
-      repo_ref: sycl-rel-6_0_0
+      repo_ref: sycl-rel-6_1_0
       devops_ref: sycl
+      tests_ref: ead7474b9cb2189ce48025550912ccad5a72bd30
       sycl_toolchain_artifact: sycl_linux_default
       sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
       sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
+      sycl_cts_artifact: sycl_cts_bin_linux
 
   run-sycl-cts:
     needs: [ubuntu2204_build, build-sycl-cts]
@@ -202,4 +204,4 @@ jobs:
       sycl_toolchain_artifact: sycl_linux_default
       sycl_toolchain_archive: ${{ needs.ubuntu2204_build.outputs.artifact_archive_name }}
       sycl_toolchain_decompress_command: ${{ needs.ubuntu2204_build.outputs.artifact_decompress_command }}
-      sycl_cts_artifact: sycl_cts_bin
+      sycl_cts_artifact: sycl_cts_bin_linux
@@ -6,6 +6,7 @@ on:
       name:
         type: string
         required: True
+
       runner:
         type: string
         required: True
@@ -27,19 +28,17 @@ on:
           Extra options to be added to LIT_OPTS.
         type: string
         default: ''
-      ref:
+
+      repo_ref:
         type: string
         required: False
+        description: |
+          Commit SHA or branch to checkout the intel/llvm repo.
       devops_ref:
         type: string
         required: False
         description: |
-          By default we checkout the devops directory from "inputs.ref" branch.
-          devops_ref may be specified to checkout the devops dir from different
-          branch.
-          Note: it doesn't affect ./devops/actions/run-tests/* as these actions
-          call checkout again and therefore override the devops directory, so
-          configs/dependecies from input.ref are used.
+          Commit SHA or branch to checkout the devops directory.
       tests_ref:
         type: string
         required: False
@@ -104,7 +103,7 @@ jobs:
       with:
         sparse-checkout: |
           devops/actions
-        ref: ${{ inputs.devops_ref|| inputs.ref || github.sha }}
+        ref: ${{ inputs.devops_ref|| inputs.repo_ref || github.sha }}
     - uses: ilammy/msvc-dev-cmd@0b201ec74fa43914dc39ae48a89fd1d8cb592756
       with:
         arch: amd64
@@ -122,7 +121,7 @@ jobs:
       if: inputs.tests_selector == 'e2e'
       with:
         path: llvm
-        ref: ${{ inputs.ref || github.sha }}
+        ref: ${{ inputs.repo_ref || github.sha }}
         cache_path: "D:\\\\github\\\\_work\\\\repo_cache\\\\"
     - name: Download compiler toolchain
       uses: actions/download-artifact@v4
 
@@ -318,6 +318,10 @@ LANGOPT(
     "SYCL compiler assumes value fits within MAX_INT for member function of "
     "get/operator[], get_id/operator[] and get_global_id/get_global_linear_id "
     "in SYCL class id, iterm and nd_iterm")
+LANGOPT(SYCLCUDACompat, 1, 0,
+        "Enable CUDA definitions and implicit includes when building for the "
+        "NVPTX backend. This mode can help SYCL program to run using the CUDA "
+        "infrastructure on Nvidia's platforms. ")
 ENUM_LANGOPT(SYCLRangeRounding, SYCLRangeRoundingPreference, 2,
     SYCLRangeRoundingPreference::On,
     "Preference for SYCL parallel_for range rounding")
 
@@ -7031,6 +7031,13 @@ defm sycl_decompose_functor
           NegFlag<SetFalse, [], [ClangOption, CLOption], "Do not">,
           BothFlags<[], [ClangOption, CLOption, CC1Option],
            " decompose SYCL functor if possible (experimental, CUDA only)">>;
+defm sycl_cuda_compat
+    : BoolFOption<"sycl-cuda-compatibility", LangOpts<"SYCLCUDACompat">, DefaultFalse,
+          PosFlag<SetTrue, [], [ClangOption, CLOption, CC1Option], "Enable CUDA compatibility mode (experimental). "
+          "Enable the use of CUDA device code with SYCL device code. "
+          "Under this mode, a SYCL device function can call a CUDA device function (but not the other way around). "
+          "This implies the definition of CUDA macros and the inclusion of implicit header files.">,
+          NegFlag<SetFalse, [], [ClangOption, CLOption, CC1Option], "Disable CUDA compatibility mode.">>;
 def flink_huge_device_code : Flag<["-"], "flink-huge-device-code">,
   HelpText<"Generate and use a custom linker script for huge device code "
            "sections">;
 
@@ -110,6 +110,7 @@ class SemaBase {
     CudaAll = CudaDevice | CudaHost,
     /// SYCL specific diagnostic.
     Sycl = 1 << 4,
+    SyclCudaCompat = Sycl | CudaAll,
     /// ESIMD specific diagnostic.
     Esimd = 1 << 5,
     /// A flag representing 'all'.  This can be used to avoid the check
 
@@ -157,6 +157,9 @@ class SemaCUDA : public SemaBase {
 
   // CUDA function call preference. Must be ordered numerically from
   // worst to best.
+  // Note: in SYCL-CUDA compatibility mode, Native, SameSide and HostDevice
+  // don't follow their naming, only the ranking system (e.g. 1st, 2nd or 3rd
+  // choice). See table near IdentifyPreference.
   enum CUDAFunctionPreference {
     CFP_Never,      // Invalid caller/callee combination.
     CFP_WrongSide,  // Calls from host-device to host or device
 
@@ -183,7 +183,7 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,
   }
 
   Opts.HIP = Lang == Language::HIP;
-  Opts.CUDA = Lang == Language::CUDA || Opts.HIP;
+  Opts.CUDA = Lang == Language::CUDA || Opts.HIP || Opts.SYCLCUDACompat;
   if (Opts.HIP) {
     // HIP toolchain does not support 'Fast' FPOpFusion in backends since it
     // fuses multiplication/addition instructions without contract flag from
 
@@ -294,11 +294,13 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       llvm_unreachable("unhandled OffloadArch");
     }();
 
-    if (Opts.SYCLIsDevice) {
+    if (Opts.SYCLIsDevice)
       Builder.defineMacro("__SYCL_CUDA_ARCH__", CUDAArchCode);
-    } else {
+    // Don't define __CUDA_ARCH__ if in SYCL device mode unless we are in
+    // SYCL-CUDA compatibility mode.
+    // For all other cases, define the macro.
+    if (!Opts.SYCLIsDevice || Opts.SYCLCUDACompat)
       Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
-    }
     if (GPU == OffloadArch::SM_90a)
       Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
     if (GPU == OffloadArch::SM_100a)
 
@@ -1858,16 +1858,6 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
   if (Body && isa_and_nonnull<CoroutineBodyStmt>(Body))
     llvm::append_range(FnArgs, FD->parameters());
 
-  // Generate a dummy __host__ function for compiling CUDA sources in SYCL.
-  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
-      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
-      FD->hasAttr<CUDADeviceAttr>()) {
-    if (FD->getReturnType()->isVoidType())
-      Builder.CreateRetVoid();
-    else
-      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
-    return;
-  }
   // When compiling a CUDA file in SYCL device mode,
   // set weak ODR linkage for possibly duplicated functions.
   if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
@@ -1884,7 +1874,22 @@ void CodeGenFunction::GenerateCode(GlobalDecl GD, llvm::Function *Fn,
 
   // Generate the body of the function.
   PGO.assignRegionCounters(GD, CurFn);
-  if (isa<CXXDestructorDecl>(FD))
+  if (getLangOpts().CUDA && !getLangOpts().CUDAIsDevice &&
+      getLangOpts().SYCLIsHost && !FD->hasAttr<CUDAHostAttr>() &&
+      FD->hasAttr<CUDADeviceAttr>()) {
+    // SYCL host compilation with CUDA compatibility enabled requires
+    // the creation of a host stub function for functions declared with
+    // the __device__ specifier but without the __host__ specifier.
+    // This is because SYCL doesn't use specifiers like CUDA does, and so
+    // may contain what appears to be a call from host to device. As we can't
+    // prevent the emission of such calls, we need to produce a symbol for
+    // functions declared with __device__.
+    if (FD->getReturnType()->isVoidType())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(llvm::UndefValue::get(Fn->getReturnType()));
+    Builder.ClearInsertionPoint();
+  } else if (isa<CXXDestructorDecl>(FD))
     EmitDestructorBody(Args);
   else if (isa<CXXConstructorDecl>(FD))
     EmitConstructorBody(Args);
 
@@ -75,6 +75,11 @@ using namespace clang::driver::tools;
 using namespace clang;
 using namespace llvm::opt;
 
+static bool isSYCLCudaCompatEnabled(const ArgList &Args) {
+  return Args.hasFlag(options::OPT_fsycl_cuda_compat,
+                      options::OPT_fno_sycl_cuda_compat, false);
+}
+
 static void CheckPreprocessingOptions(const Driver &D, const ArgList &Args) {
   if (Arg *A = Args.getLastArg(clang::driver::options::OPT_C, options::OPT_CC,
                                options::OPT_fminimize_whitespace,
@@ -1176,7 +1181,8 @@ void Clang::AddPreprocessingOptions(Compilation &C, const JobAction &JA,
 
   if (JA.isOffloading(Action::OFK_SYCL)) {
     getToolChain().addSYCLIncludeArgs(Args, CmdArgs);
-    if (Inputs[0].getType() == types::TY_CUDA) {
+    if (Inputs[0].getType() == types::TY_CUDA ||
+        isSYCLCudaCompatEnabled(Args)) {
       // Include __clang_cuda_runtime_wrapper.h in .cu SYCL compilation.
       getToolChain().AddCudaIncludeArgs(Args, CmdArgs);
     }
@@ -5463,6 +5469,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
   bool IsFPGASYCLOffloadDevice =
       IsSYCLDevice && Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga;
   const bool IsSYCLNativeCPU = isSYCLNativeCPU(TC);
+  const bool IsSYCLCUDACompat = isSYCLCudaCompatEnabled(Args);
 
   // Perform the SYCL host compilation using an external compiler if the user
   // requested.
@@ -5832,6 +5839,17 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
         CmdArgs.push_back("-fno-sycl-esimd-build-host-code");
     }
 
+    if (IsSYCLCUDACompat) {
+      Args.addOptInFlag(CmdArgs, options::OPT_fsycl_cuda_compat,
+                        options::OPT_fno_sycl_cuda_compat);
+      // FIXME: clang's CUDA headers require this ...
+      // remove when clang/lib/Headers/__clang_cuda_builtin_vars.h no longer
+      // requires it.
+      CmdArgs.push_back("-fdeclspec");
+      // Note: assumes CUDA 9.0 or more (required by SYCL for CUDA)
+      CmdArgs.push_back("-fcuda-allow-variadic-functions");
+    }
+
     // Set options for both host and device
     if (SYCLStdArg) {
       SYCLStdArg->render(Args, CmdArgs);
@@ -5898,6 +5916,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
     bool HasFPGA = false;
     for (auto TI = SYCLTCRange.first, TE = SYCLTCRange.second; TI != TE; ++TI) {
       llvm::Triple SYCLTriple = TI->second->getTriple();
+      if (SYCLTriple.isNVPTX() && IsSYCLCUDACompat && !IsSYCLDevice) {
+        CmdArgs.push_back("-aux-triple");
+        CmdArgs.push_back(Args.MakeArgString(SYCLTriple.normalize()));
+        // We need to figure out which CUDA version we're compiling for, as that
+        // determines how we load and launch GPU kernels.
+        auto *CTC = static_cast<const toolchains::CudaToolChain *>(TI->second);
+        assert(CTC && "Expected valid CUDA Toolchain.");
+        if (CTC->CudaInstallation.version() != CudaVersion::UNKNOWN)
+          CmdArgs.push_back(Args.MakeArgString(
+              Twine("-target-sdk-version=") +
+              CudaVersionToString(CTC->CudaInstallation.version())));
+        break;
+      }
       if (SYCLTriple.getSubArch() == llvm::Triple::SPIRSubArch_fpga) {
         HasFPGA = true;
         if (!IsSYCLDevice) {
 
@@ -4198,6 +4198,9 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args,
   Opts.IncludeDefaultHeader = Args.hasArg(OPT_finclude_default_header);
   Opts.DeclareOpenCLBuiltins = Args.hasArg(OPT_fdeclare_opencl_builtins);
 
+  // Resolve the SYCL-CUDA compatibility flag before setLangDefaults(), which
+  // reads it to decide Opts.CUDA. hasFlag() lets the last of the positive and
+  // negative spellings win and falls back to false when neither is given.
+  Opts.SYCLCUDACompat =
+      Args.hasFlag(OPT_fsycl_cuda_compat, OPT_fno_sycl_cuda_compat, false);
+
   LangOptions::setLangDefaults(Opts, IK.getLanguage(), T, Includes, LangStd);
 
   // The key paths of codegen options defined in Options.td start with
 
@@ -1511,10 +1511,15 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   }
 
   // CUDA device path compilaton
-  if (LangOpts.CUDAIsDevice && !LangOpts.HIP && !LangOpts.isSYCL()) {
+  // Enabled if CUDA device compilation mode is on unless HIP is
+  // active or SYCL is active without CUDA compatibility enabled.
+  bool EnableCUDADevicePath = LangOpts.CUDAIsDevice && !LangOpts.HIP &&
+                              (!LangOpts.isSYCL() || LangOpts.SYCLCUDACompat);
+  if (EnableCUDADevicePath) {
     // The CUDA_ARCH value is set for the GPU target specified in the NVPTX
     // backend's target defines.
-    // Note: SYCL targeting nvptx-cuda relies on __SYCL_CUDA_ARCH__ instead.
+    // Note: SYCL targeting nvptx-cuda without SYCL-CUDA compatibility relies on
+    // __SYCL_CUDA_ARCH__ only instead.
     Builder.defineMacro("__CUDA_ARCH__");
   }
 
 
@@ -2093,9 +2093,19 @@ Sema::targetDiag(SourceLocation Loc, unsigned DiagID, const FunctionDecl *FD) {
     return LangOpts.OpenMPIsTargetDevice
                ? OpenMP().diagIfOpenMPDeviceCode(Loc, DiagID, FD)
                : OpenMP().diagIfOpenMPHostCode(Loc, DiagID, FD);
-  if (getLangOpts().CUDA)
+
+  // If SYCLCUDACompat is active, use the SYCL logic instead of CUDA when
+  // compiling the device side but the CUDA logic when compiling the host side.
+  // When compiling the device side, we need this as CUDA looks for the presence
+  // of __device__, __host__ etc. attributes to emit or defer diagnostics. These
+  // aren't always there as SYCL doesn't use such attributes.
+  if (getLangOpts().CUDA && !getLangOpts().SYCLCUDACompat)
     return getLangOpts().CUDAIsDevice ? CUDA().DiagIfDeviceCode(Loc, DiagID)
                                       : CUDA().DiagIfHostCode(Loc, DiagID);
+  // On the host side, __device__ acts as a guard like __SYCL_DEVICE_ONLY__
+  // macro, so use the CUDA logic here.
+  if (getLangOpts().SYCLIsHost && getLangOpts().SYCLCUDACompat)
+    return CUDA().DiagIfHostCode(Loc, DiagID);
 
   if (getLangOpts().SYCLIsDevice)
     return SYCL().DiagIfDeviceCode(Loc, DiagID);
Original file line number	Diff line number	Diff line change
`@@ -183,7 +183,7 @@ void LangOptions::setLangDefaults(LangOptions &Opts, Language Lang,`
`183`	`183`	`}`
`184`	`184`
`185`	`185`	`Opts.HIP = Lang == Language::HIP;`
`186`		`- Opts.CUDA = Lang == Language::CUDA \|\| Opts.HIP;`
	`186`	`+ Opts.CUDA = Lang == Language::CUDA \|\| Opts.HIP \|\| Opts.SYCLCUDACompat;`
`187`	`187`	`if (Opts.HIP) {`
`188`	`188`	`// HIP toolchain does not support 'Fast' FPOpFusion in backends since it`
`189`	`189`	`// fuses multiplication/addition instructions without contract flag from`