diff --git a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
index a60e2d1f0ab09..9c868f6d3b920 100644
--- a/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
+++ b/compiler/rustc_codegen_llvm/src/builder/gpu_offload.rs
@@ -3,11 +3,12 @@ use std::ffi::CString;
 use bitflags::Flags;
 use llvm::Linkage::*;
 use rustc_abi::Align;
+use rustc_codegen_ssa::MemFlags;
 use rustc_codegen_ssa::common::TypeKind;
 use rustc_codegen_ssa::mir::operand::{OperandRef, OperandValue};
 use rustc_codegen_ssa::traits::{BaseTypeCodegenMethods, BuilderMethods};
 use rustc_middle::bug;
-use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata};
+use rustc_middle::ty::offload_meta::{MappingFlags, OffloadMetadata, OffloadSize};
 
 use crate::builder::Builder;
 use crate::common::CodegenCx;
@@ -450,7 +451,15 @@ pub(crate) fn gen_define_handling<'ll>(
     // FIXME(offload): add `OMP_MAP_TARGET_PARAM = 0x20` only if necessary
     let transfer_kernel = vec![MappingFlags::TARGET_PARAM.bits(); transfer_to.len()];
 
-    let offload_sizes = add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &sizes);
+    let actual_sizes = sizes
+        .iter()
+        .map(|s| match s {
+            OffloadSize::Static(sz) => *sz,
+            OffloadSize::Dynamic => 0,
+        })
+        .collect::<Vec<u64>>();
+    let offload_sizes =
+        add_priv_unnamed_arr(&cx, &format!(".offload_sizes.{symbol}"), &actual_sizes);
     let memtransfer_begin =
         add_priv_unnamed_arr(&cx, &format!(".offload_maptypes.{symbol}.begin"), &transfer_to);
     let memtransfer_kernel =
@@ -499,9 +508,6 @@ pub(crate) fn gen_define_handling<'ll>(
         region_id,
     };
 
-    // FIXME(Sa4dUs): use this global for constant offload sizes
-    cx.add_compiler_used_global(result.offload_sizes);
-
     cx.offload_kernel_cache.borrow_mut().insert(symbol, result);
 
     result
@@ -535,6 +541,15 @@ pub(crate) fn scalar_width<'ll>(cx: &'ll SimpleCx<'_>, ty: &'ll Type) -> u64 {
     }
 }
 
+fn get_runtime_size<'ll, 'tcx>(
+    _cx: &CodegenCx<'ll, 'tcx>,
+    _val: &'ll Value,
+    _meta: &OffloadMetadata,
+) -> &'ll Value {
+    // FIXME(Sa4dUs): handle dynamic-size data (e.g. slices)
+    bug!("offload does not support dynamic sizes yet");
+}
+
 // For each kernel *call*, we now use some of our previous declared globals to move data to and from
 // the gpu. For now, we only handle the data transfer part of it.
 // If two consecutive kernels use the same memory, we still move it to the host and back to the gpu.
@@ -564,15 +579,17 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
 ) {
     let cx = builder.cx;
     let OffloadKernelGlobals {
+        offload_sizes,
         memtransfer_begin,
         memtransfer_kernel,
         memtransfer_end,
         region_id,
-        ..
     } = offload_data;
 
     let OffloadKernelDims { num_workgroups, threads_per_block, workgroup_dims, thread_dims } =
         offload_dims;
 
+    let has_dynamic = metadata.iter().any(|m| matches!(m.payload_size, OffloadSize::Dynamic));
+
     let tgt_decl = offload_globals.launcher_fn;
     let tgt_target_kernel_ty = offload_globals.launcher_ty;
 
@@ -596,7 +613,24 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
     let a2 = builder.direct_alloca(ty, Align::EIGHT, ".offload_ptrs");
     // These represent the sizes in bytes, e.g. the entry for `&[f64; 16]` will be 8*16.
     let ty2 = cx.type_array(cx.type_i64(), num_args);
-    let a4 = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+
+    let a4 = if has_dynamic {
+        let alloc = builder.direct_alloca(ty2, Align::EIGHT, ".offload_sizes");
+
+        builder.memcpy(
+            alloc,
+            Align::EIGHT,
+            offload_sizes,
+            Align::EIGHT,
+            cx.get_const_i64(8 * args.len() as u64),
+            MemFlags::empty(),
+            None,
+        );
+
+        alloc
+    } else {
+        offload_sizes
+    };
     //%kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
     let a5 = builder.direct_alloca(tgt_kernel_decl, Align::EIGHT, "kernel_args");
 
@@ -648,9 +682,12 @@ pub(crate) fn gen_call_handling<'ll, 'tcx>(
         builder.store(vals[i as usize], gep1, Align::EIGHT);
         let gep2 = builder.inbounds_gep(ty, a2, &[i32_0, idx]);
         builder.store(geps[i as usize], gep2, Align::EIGHT);
-        let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
-        // FIXME(offload): write an offload frontend and handle arbitrary types.
-        builder.store(cx.get_const_i64(metadata[i as usize].payload_size), gep3, Align::EIGHT);
+
+        if matches!(metadata[i as usize].payload_size, OffloadSize::Dynamic) {
+            let gep3 = builder.inbounds_gep(ty2, a4, &[i32_0, idx]);
+            let size_val = get_runtime_size(cx, args[i as usize], &metadata[i as usize]);
+            builder.store(size_val, gep3, Align::EIGHT);
+        }
     }
 
     // For now we have a very simplistic indexing scheme into our
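
Note on the strategy above: `gen_define_handling` now always emits a constant
`@.offload_sizes.<symbol>` global, with `0` as a placeholder for entries whose
size is only known at runtime, and `gen_call_handling` only falls back to a
per-call stack table (the `memcpy` above) when such an entry exists, patching
the dynamic slots in place. A standalone sketch of that logic, assuming
illustrative names (`build_size_table` and `runtime_sizes` are not rustc APIs):

    #[derive(Debug, Copy, Clone)]
    enum OffloadSize {
        Dynamic,
        Static(u64),
    }

    /// Model of the table selection: with no dynamic entries the constant
    /// table is used as-is (no per-call alloca); otherwise it is copied and
    /// the dynamic slots are overwritten with runtime values.
    fn build_size_table(meta: &[OffloadSize], runtime_sizes: &[u64]) -> Vec<u64> {
        let has_dynamic = meta.iter().any(|s| matches!(s, OffloadSize::Dynamic));
        // Constant part, with 0 as the placeholder for dynamic entries,
        // mirroring `actual_sizes` in gen_define_handling.
        let mut table: Vec<u64> = meta
            .iter()
            .map(|s| match s {
                OffloadSize::Static(sz) => *sz,
                OffloadSize::Dynamic => 0,
            })
            .collect();
        if has_dynamic {
            for (i, s) in meta.iter().enumerate() {
                if matches!(s, OffloadSize::Dynamic) {
                    table[i] = runtime_sizes[i];
                }
            }
        }
        table
    }

    fn main() {
        let meta = [OffloadSize::Static(128), OffloadSize::Dynamic];
        assert_eq!(build_size_table(&meta, &[0, 4096]), vec![128, 4096]);
    }

When `has_dynamic` is false no per-call table is materialized at all, which is
exactly what the updated codegen tests below check for.
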
diff --git a/compiler/rustc_middle/src/ty/offload_meta.rs b/compiler/rustc_middle/src/ty/offload_meta.rs
index 67c00765ed57b..849670d76d464 100644
--- a/compiler/rustc_middle/src/ty/offload_meta.rs
+++ b/compiler/rustc_middle/src/ty/offload_meta.rs
@@ -3,10 +3,16 @@ use bitflags::bitflags;
 use crate::ty::{self, PseudoCanonicalInput, Ty, TyCtxt, TypingEnv};
 
 pub struct OffloadMetadata {
-    pub payload_size: u64,
+    pub payload_size: OffloadSize,
     pub mode: MappingFlags,
 }
 
+#[derive(Debug, Copy, Clone)]
+pub enum OffloadSize {
+    Dynamic,
+    Static(u64),
+}
+
 bitflags! {
     /// Mirrors `OpenMPOffloadMappingFlags` from Clang/OpenMP.
     #[derive(Debug, Copy, Clone)]
@@ -59,17 +65,18 @@ impl OffloadMetadata {
 }
 
 // FIXME(Sa4dUs): implement a solid logic to determine the payload size
-fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> u64 {
+fn get_payload_size<'tcx>(tcx: TyCtxt<'tcx>, ty: Ty<'tcx>) -> OffloadSize {
     match ty.kind() {
         ty::RawPtr(inner, _) | ty::Ref(_, inner, _) => get_payload_size(tcx, *inner),
-        _ => tcx
-            .layout_of(PseudoCanonicalInput {
+        _ => OffloadSize::Static(
+            tcx.layout_of(PseudoCanonicalInput {
                 typing_env: TypingEnv::fully_monomorphized(),
                 value: ty,
             })
             .unwrap()
             .size
             .bytes(),
+        ),
     }
 }
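
The recursion in `get_payload_size` peels references and raw pointers and
reports the pointee's layout size; so far every pointee ends up as `Static`,
and `Dynamic` is only plumbing for upcoming slice support. A minimal model of
that recursion, assuming a toy `ModelTy` in place of rustc's `Ty` machinery,
reproducing the `&[f64; 16]` -> 8*16 example from the comment in
`gen_call_handling`:

    #[derive(Debug, Copy, Clone, PartialEq)]
    enum OffloadSize {
        Dynamic,
        Static(u64),
    }

    enum ModelTy {
        Ref(Box<ModelTy>),
        Array { elem_bytes: u64, len: u64 },
        Slice,
    }

    fn payload_size(ty: &ModelTy) -> OffloadSize {
        match ty {
            // References are peeled, like the ty::RawPtr / ty::Ref arm.
            ModelTy::Ref(inner) => payload_size(inner),
            // Sized pointees report their layout size in bytes.
            ModelTy::Array { elem_bytes, len } => OffloadSize::Static(elem_bytes * len),
            // Where a slice would presumably map to Dynamic one day; the PR
            // only adds the variant, nothing produces it yet.
            ModelTy::Slice => OffloadSize::Dynamic,
        }
    }

    fn main() {
        // `&[f64; 16]` -> 8 * 16 = 128 bytes.
        let ty = ModelTy::Ref(Box::new(ModelTy::Array { elem_bytes: 8, len: 16 }));
        assert_eq!(payload_size(&ty), OffloadSize::Static(128));
        assert_eq!(payload_size(&ModelTy::Ref(Box::new(ModelTy::Slice))), OffloadSize::Dynamic);
    }
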
diff --git a/tests/codegen-llvm/gpu_offload/control_flow.rs b/tests/codegen-llvm/gpu_offload/control_flow.rs
index fb483db667b2a..503e9e4221cdc 100644
--- a/tests/codegen-llvm/gpu_offload/control_flow.rs
+++ b/tests/codegen-llvm/gpu_offload/control_flow.rs
@@ -14,14 +14,13 @@
 // CHECK-NOT: define
 // CHECK: %.offload_baseptrs = alloca [1 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [1 x ptr], align 8
-// CHECK-NEXT: %.offload_sizes = alloca [1 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
 // CHECK: br label %bb3
 // CHECK-NOT define
 // CHECK: bb3
-// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
+// CHECK: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.begin, ptr null, ptr null)
 // CHECK: %10 = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.foo.region_id, ptr nonnull %kernel_args)
-// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 1, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.foo, ptr nonnull @.offload_maptypes.foo.end, ptr null, ptr null)
 #[unsafe(no_mangle)]
 unsafe fn main() {
     let A = [1.0, 1.0, 1.0, 1.0, 1.0, 1.0];
diff --git a/tests/codegen-llvm/gpu_offload/gpu_host.rs b/tests/codegen-llvm/gpu_offload/gpu_host.rs
index f25ba679abbdf..8179d868da95f 100644
--- a/tests/codegen-llvm/gpu_offload/gpu_host.rs
+++ b/tests/codegen-llvm/gpu_offload/gpu_host.rs
@@ -58,18 +58,14 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
 // CHECK-NEXT: %x = alloca [1024 x i8], align 16
 // CHECK-NEXT: %.offload_baseptrs = alloca [2 x ptr], align 8
 // CHECK-NEXT: %.offload_ptrs = alloca [2 x ptr], align 8
-// CHECK-NEXT: %.offload_sizes = alloca [2 x i64], align 8
 // CHECK-NEXT: %kernel_args = alloca %struct.__tgt_kernel_arguments, align 8
 // CHECK: store ptr %x, ptr %.offload_baseptrs, align 8
 // CHECK-NEXT: store ptr %x, ptr %.offload_ptrs, align 8
-// CHECK-NEXT: store i64 1024, ptr %.offload_sizes, align 8
 // CHECK-NEXT: [[BPTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_baseptrs, i64 8
 // CHECK-NEXT: store ptr %y, ptr [[BPTRS_1]], align 8
 // CHECK-NEXT: [[PTRS_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
 // CHECK-NEXT: store ptr %y, ptr [[PTRS_1]], align 8
-// CHECK-NEXT: [[SIZES_1:%.*]] = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
-// CHECK-NEXT: store i64 1024, ptr [[SIZES_1]], align 8
-// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_begin_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].begin, ptr null, ptr null)
 // CHECK-NEXT: store i32 3, ptr %kernel_args, align 8
 // CHECK-NEXT: [[P4:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 4
 // CHECK-NEXT: store i32 2, ptr [[P4]], align 4
@@ -78,7 +74,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
 // CHECK-NEXT: [[P16:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 16
 // CHECK-NEXT: store ptr %.offload_ptrs, ptr [[P16]], align 8
 // CHECK-NEXT: [[P24:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 24
-// CHECK-NEXT: store ptr %.offload_sizes, ptr [[P24]], align 8
+// CHECK-NEXT: store ptr @.offload_sizes.[[K]], ptr [[P24]], align 8
 // CHECK-NEXT: [[P32:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 32
 // CHECK-NEXT: store ptr @.offload_maptypes.[[K]].kernel, ptr [[P32]], align 8
 // CHECK-NEXT: [[P40:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 40
@@ -92,7 +88,7 @@ pub fn _kernel_1(x: &mut [f32; 256], y: &[f32; 256]) {
 // CHECK-NEXT: [[P96:%[^ ]+]] = getelementptr inbounds nuw i8, ptr %kernel_args, i64 96
 // CHECK-NEXT: store i32 0, ptr [[P96]], align 8
 // CHECK-NEXT: [[TGT_RET:%.*]] = call i32 @__tgt_target_kernel(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 256, i32 32, ptr nonnull @.[[K]].region_id, ptr nonnull %kernel_args)
-// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull %.offload_sizes, ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
+// CHECK-NEXT: call void @__tgt_target_data_end_mapper(ptr nonnull @anon.{{.*}}.1, i64 -1, i32 2, ptr nonnull %.offload_baseptrs, ptr nonnull %.offload_ptrs, ptr nonnull @.offload_sizes.[[K]], ptr nonnull @.offload_maptypes.[[K]].end, ptr null, ptr null)
 
 // CHECK: ret void
 // CHECK-NEXT: }
diff --git a/tests/codegen-llvm/gpu_offload/scalar_host.rs b/tests/codegen-llvm/gpu_offload/scalar_host.rs
index 8c7dcd4dd5817..d5b40fb0a26db 100644
--- a/tests/codegen-llvm/gpu_offload/scalar_host.rs
+++ b/tests/codegen-llvm/gpu_offload/scalar_host.rs
@@ -20,8 +20,6 @@
 // CHECK-NEXT: store double %_0.i, ptr %1, align 8
 // CHECK-NEXT: %2 = getelementptr inbounds nuw i8, ptr %.offload_ptrs, i64 8
 // CHECK-NEXT: store ptr %addr, ptr %2, align 8
-// CHECK-NEXT: %3 = getelementptr inbounds nuw i8, ptr %.offload_sizes, i64 8
-// CHECK-NEXT: store i64 4, ptr %3, align 8
 // CHECK-NEXT: call void @__tgt_target_data_begin_mapper
 
 #[unsafe(no_mangle)]
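
Taken together, the test updates assert the fully-static path: the per-call
%.offload_sizes alloca and its stores disappear, and both the mapper calls and
the kernel_args struct reference the constant @.offload_sizes.<kernel> global
directly. The other path would be exercised by kernels like the hypothetical
one below (not part of the test suite): the payload behind `&mut [f32]` is
`x.len() * 4` bytes only at runtime, so its size entry cannot live in the
constant global. Note that nothing classifies such a type as
`OffloadSize::Dynamic` yet, and `get_runtime_size` still ends in `bug!`, per
the FIXMEs above.

    // Hypothetical kernel with a dynamically sized argument; its
    // `.offload_sizes` entry would have to be patched at the call site.
    #[unsafe(no_mangle)]
    pub fn _kernel_dyn(x: &mut [f32]) {
        for v in x.iter_mut() {
            *v *= 2.0;
        }
    }
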