diff --git a/compiler/rustc_codegen_gcc/src/asm.rs b/compiler/rustc_codegen_gcc/src/asm.rs
index f237861b1595a..1dd0f2ed58bd8 100644
--- a/compiler/rustc_codegen_gcc/src/asm.rs
+++ b/compiler/rustc_codegen_gcc/src/asm.rs
@@ -665,6 +665,8 @@ fn reg_class_to_gcc(reg_class: InlineAsmRegClass) -> &'static str {
         InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => {
             unreachable!("clobber-only")
         }
+        InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::vgpr) => "v",
+        InlineAsmRegClass::Amdgpu(AmdgpuInlineAsmRegClass::sgpr) => "Sg",
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => "r",
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg)
         | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::dreg_low16)
@@ -761,6 +763,7 @@ fn dummy_output_type<'gcc, 'tcx>(cx: &CodegenCx<'gcc, 'tcx>, reg: InlineAsmRegCl
         InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => {
             unreachable!("clobber-only")
         }
+        InlineAsmRegClass::Amdgpu(_) => cx.type_i32(),
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => cx.type_i32(),
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg)
         | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => cx.type_f32(),
@@ -946,6 +949,7 @@ fn modifier_to_gcc(
         InlineAsmRegClass::AArch64(AArch64InlineAsmRegClass::preg) => {
             unreachable!("clobber-only")
         }
+        InlineAsmRegClass::Amdgpu(_) => None,
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::reg) => None,
         InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg)
         | InlineAsmRegClass::Arm(ArmInlineAsmRegClass::sreg_low16) => None,
diff --git a/compiler/rustc_codegen_llvm/src/asm.rs b/compiler/rustc_codegen_llvm/src/asm.rs
index 8cd4bdc372789..273f7a585bf4c 100644
--- a/compiler/rustc_codegen_llvm/src/asm.rs
+++ b/compiler/rustc_codegen_llvm/src/asm.rs
@@ -229,6 +229,7 @@ impl<'ll, 'tcx> AsmBuilderMethods<'tcx> for Builder<'_, 'll, 'tcx> {
             InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC | InlineAsmArch::Arm => {
                 constraints.push("~{cc}".to_string());
             }
+            InlineAsmArch::Amdgpu => {}
             InlineAsmArch::X86 | InlineAsmArch::X86_64 => {
                 constraints.extend_from_slice(&[
                     "~{dirflag}".to_string(),
@@ -645,6 +646,7 @@ fn reg_to_llvm(reg: InlineAsmRegOrRegClass, layout: Option<&TyAndLayout<'_>>) ->
             | Arm(ArmInlineAsmRegClass::dreg_low8)
             | Arm(ArmInlineAsmRegClass::qreg_low4) => "x",
             Arm(ArmInlineAsmRegClass::dreg) | Arm(ArmInlineAsmRegClass::qreg) => "w",
+            Amdgpu(class) => class.prefix(),
             Hexagon(HexagonInlineAsmRegClass::reg) => "r",
             Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"),
             LoongArch(LoongArchInlineAsmRegClass::reg) => "r",
@@ -745,6 +747,7 @@ fn modifier_to_llvm(
                 modifier
             }
         }
+        Amdgpu(_) => None,
         Hexagon(_) => None,
         LoongArch(_) => None,
         Mips(_) => None,
@@ -825,6 +828,7 @@ fn dummy_output_type<'ll>(cx: &CodegenCx<'ll, '_>, reg: InlineAsmRegClass) -> &'
         Arm(ArmInlineAsmRegClass::qreg)
         | Arm(ArmInlineAsmRegClass::qreg_low8)
         | Arm(ArmInlineAsmRegClass::qreg_low4) => cx.type_vector(cx.type_i64(), 2),
+        Amdgpu(_) => cx.type_i32(),
         Hexagon(HexagonInlineAsmRegClass::reg) => cx.type_i32(),
         Hexagon(HexagonInlineAsmRegClass::preg) => unreachable!("clobber-only"),
         LoongArch(LoongArchInlineAsmRegClass::reg) => cx.type_i32(),
diff --git a/compiler/rustc_span/src/symbol.rs b/compiler/rustc_span/src/symbol.rs
index f2b13dad1fd90..800474d21ec20 100644
--- a/compiler/rustc_span/src/symbol.rs
+++ b/compiler/rustc_span/src/symbol.rs
@@ -2028,6 +2028,7 @@ symbols! {
         self_struct_ctor,
         semiopaque,
         semitransparent,
+        sgpr,
         sha2,
         sha3,
         sha512_sm_x86,
@@ -2448,6 +2449,7 @@ symbols! {
         verbatim,
         version,
         vfp2,
+        vgpr,
         vis,
         visible_private_types,
         volatile,
diff --git a/compiler/rustc_target/src/asm/amdgpu.rs b/compiler/rustc_target/src/asm/amdgpu.rs
new file mode 100644
index 0000000000000..06ae68a46da8e
--- /dev/null
+++ b/compiler/rustc_target/src/asm/amdgpu.rs
@@ -0,0 +1,234 @@
+use std::fmt;
+
+use rustc_span::Symbol;
+
+use super::{InlineAsmArch, InlineAsmType, ModifierInfo};
+
+def_reg_class! {
+    Amdgpu AmdgpuInlineAsmRegClass {
+        sgpr,
+        vgpr,
+    }
+}
+
+// See https://llvm.org/docs/AMDGPUOperandSyntax.html
+impl AmdgpuInlineAsmRegClass {
+    pub fn valid_modifiers(self, _arch: InlineAsmArch) -> &'static [char] {
+        &[]
+    }
+
+    pub fn suggest_class(self, _arch: InlineAsmArch, _ty: InlineAsmType) -> Option<Self> {
+        None
+    }
+
+    pub fn suggest_modifier(
+        self,
+        _arch: InlineAsmArch,
+        _ty: InlineAsmType,
+    ) -> Option<ModifierInfo> {
+        None
+    }
+
+    pub fn default_modifier(self, _arch: InlineAsmArch) -> Option<ModifierInfo> {
+        None
+    }
+
+    pub fn supported_types(
+        self,
+        _arch: InlineAsmArch,
+    ) -> &'static [(InlineAsmType, Option<Symbol>)] {
+        types! { _: I16, F16, I32, F32, I64, F64, I128; }
+    }
+
+    /// The number of supported registers in this class.
+    /// The returned number is a count, so the supported register
+    /// indices are 0 to max_num()-1.
+    fn max_num(self) -> u32 {
+        match self {
+            Self::sgpr => 106,
+            Self::vgpr => 256,
+        }
+    }
+
+    /// The prefix used when printing a register; also the register constraint in LLVM.
+    pub fn prefix(self) -> &'static str {
+        match self {
+            Self::sgpr => "s",
+            Self::vgpr => "v",
+        }
+    }
+
+    /// Get the register class from its prefix.
+    fn parse_prefix(prefix: char) -> Result<Self, &'static str> {
+        match prefix {
+            's' => Ok(Self::sgpr),
+            'v' => Ok(Self::vgpr),
+            _ => Err("unknown register prefix"),
+        }
+    }
+}
+
+#[derive(
+    Copy,
+    Clone,
+    rustc_macros::Encodable,
+    rustc_macros::Decodable,
+    Debug,
+    Eq,
+    PartialEq,
+    PartialOrd,
+    Hash,
+    rustc_macros::HashStable_Generic
+)]
+enum AmdgpuRegRange {
+    /// The low 16 bits of a register
+    Low(u32),
+    /// The high 16 bits of a register
+    High(u32),
+    /// One or more 32-bit registers, in the inclusive range
+    Range { start: u32, end: u32 },
+}
+
+#[derive(
+    Copy,
+    Clone,
+    rustc_macros::Encodable,
+    rustc_macros::Decodable,
+    Debug,
+    Eq,
+    PartialEq,
+    PartialOrd,
+    Hash,
+    rustc_macros::HashStable_Generic
+)]
+#[allow(non_camel_case_types)]
+pub struct AmdgpuInlineAsmReg {
+    class: AmdgpuInlineAsmRegClass,
+    range: AmdgpuRegRange,
+}
+
+impl AmdgpuInlineAsmReg {
+    pub fn name(self) -> String {
+        let c = self.class.prefix();
+        match self.range {
+            AmdgpuRegRange::Low(n) => format!("{c}{n}.l"),
+            AmdgpuRegRange::High(n) => format!("{c}{n}.h"),
+            AmdgpuRegRange::Range { start, end } if start == end => format!("{c}{start}"),
+            AmdgpuRegRange::Range { start, end } => format!("{c}[{start}:{end}]"),
+        }
+    }
+
+    pub fn reg_class(self) -> AmdgpuInlineAsmRegClass {
+        self.class
+    }
+
+    pub fn parse(name: &str) -> Result<Self, &'static str> {
+        if name.is_empty() {
+            return Err("invalid empty register");
+        }
+        let class = AmdgpuInlineAsmRegClass::parse_prefix(name.chars().next().unwrap())?;
+        // Form with range, e.g. s[2:3]
+        let res;
+        if name[1..].starts_with('[') {
+            if !name.ends_with(']') {
+                return Err("invalid register, missing closing bracket");
+            }
+            if let Some((start, end)) = name[2..name.len() - 1].split_once(':') {
+                let Ok(start) = start.parse() else {
+                    return Err("invalid register range start");
+                };
+                let Ok(end) = end.parse() else {
+                    return Err("invalid register range end");
+                };
+
+                // Check range
+                if start > end {
+                    return Err("invalid reversed register range");
+                }
+
+                if end >= class.max_num() {
+                    return Err("too large register for this class");
+                }
+                res = Self { class, range: AmdgpuRegRange::Range { start, end } };
+            } else {
+                return Err("invalid register range");
+            }
+        } else {
+            let parse_num = |core: &str| {
+                let Ok(start) = core.parse() else {
+                    return Err("invalid register number");
+                };
+
+                if start >= class.max_num() {
+                    return Err("too large register for this class");
+                }
+
+                Ok(start)
+            };
+
+            let name = &name[1..];
+            let range = if let Some(name) = name.strip_suffix(".l") {
+                AmdgpuRegRange::Low(parse_num(name)?)
+            } else if let Some(name) = name.strip_suffix(".h") {
+                AmdgpuRegRange::High(parse_num(name)?)
+            } else {
+                let start = parse_num(name)?;
+                AmdgpuRegRange::Range { start, end: start }
+            };
+            res = Self { class, range };
+        }
+        Ok(res)
+    }
+
+    pub fn validate(
+        self,
+        _arch: super::InlineAsmArch,
+        _reloc_model: crate::spec::RelocModel,
+        _target_features: &rustc_data_structures::fx::FxIndexSet<Symbol>,
+        _target: &crate::spec::Target,
+        _is_clobber: bool,
+    ) -> Result<(), &'static str> {
+        Ok(())
+    }
+}
+
+pub(super) fn fill_reg_map(
+    _arch: super::InlineAsmArch,
+    _reloc_model: crate::spec::RelocModel,
+    _target_features: &rustc_data_structures::fx::FxIndexSet<Symbol>,
+    _target: &crate::spec::Target,
+    map: &mut rustc_data_structures::fx::FxHashMap<
+        super::InlineAsmRegClass,
+        rustc_data_structures::fx::FxIndexSet<super::InlineAsmReg>,
+    >,
+) {
+    use super::{InlineAsmReg, InlineAsmRegClass};
+
+    // Add single registers of each class (no register ranges)
+    #[allow(rustc::potential_query_instability)]
+    for class in regclass_map().keys() {
+        let InlineAsmRegClass::Amdgpu(class) = *class else { unreachable!("Must be amdgpu class") };
+        if let Some(set) = map.get_mut(&InlineAsmRegClass::Amdgpu(class)) {
+            for i in 0..class.max_num() {
+                set.insert(InlineAsmReg::Amdgpu(AmdgpuInlineAsmReg {
+                    class,
+                    range: AmdgpuRegRange::Range { start: i, end: i },
+                }));
+            }
+        }
+    }
+}
+
+impl AmdgpuInlineAsmReg {
+    pub fn emit(
+        self,
+        out: &mut dyn fmt::Write,
+        _arch: InlineAsmArch,
+        _modifier: Option<char>,
+    ) -> fmt::Result {
+        out.write_str(&self.name())
+    }
+
+    // There are too many conflicts to list
+    pub fn overlapping_regs(self, mut _cb: impl FnMut(AmdgpuInlineAsmReg)) {}
+}
diff --git a/compiler/rustc_target/src/asm/mod.rs b/compiler/rustc_target/src/asm/mod.rs
index 57d9cdad454ac..cab800f553c75 100644
--- a/compiler/rustc_target/src/asm/mod.rs
+++ b/compiler/rustc_target/src/asm/mod.rs
@@ -1,3 +1,4 @@
+use std::borrow::Cow;
 use std::fmt;
 
 use rustc_abi::Size;
@@ -177,6 +178,7 @@ macro_rules! types {
 }
 
 mod aarch64;
+mod amdgpu;
 mod arm;
 mod avr;
 mod bpf;
@@ -196,6 +198,7 @@ mod wasm;
 mod x86;
 
 pub use aarch64::{AArch64InlineAsmReg, AArch64InlineAsmRegClass};
+pub use amdgpu::{AmdgpuInlineAsmReg, AmdgpuInlineAsmRegClass};
 pub use arm::{ArmInlineAsmReg, ArmInlineAsmRegClass};
 pub use avr::{AvrInlineAsmReg, AvrInlineAsmRegClass};
 pub use bpf::{BpfInlineAsmReg, BpfInlineAsmRegClass};
@@ -224,6 +227,7 @@ pub enum InlineAsmArch {
     RiscV32,
     RiscV64,
     Nvptx64,
+    Amdgpu,
     Hexagon,
     LoongArch32,
     LoongArch64,
@@ -252,6 +256,7 @@ impl InlineAsmArch {
             Arch::Arm => Some(Self::Arm),
             Arch::Arm64EC => Some(Self::Arm64EC),
             Arch::AArch64 => Some(Self::AArch64),
+            Arch::AmdGpu => Some(Self::Amdgpu),
             Arch::RiscV32 => Some(Self::RiscV32),
             Arch::RiscV64 => Some(Self::RiscV64),
             Arch::Nvptx64 => Some(Self::Nvptx64),
@@ -273,7 +278,7 @@ impl InlineAsmArch {
             Arch::Msp430 => Some(Self::Msp430),
             Arch::M68k => Some(Self::M68k),
             Arch::CSky => Some(Self::CSKY),
-            Arch::AmdGpu | Arch::Xtensa | Arch::Other(_) => None,
+            Arch::Xtensa | Arch::Other(_) => None,
         }
     }
 }
@@ -283,6 +288,7 @@ impl InlineAsmArch {
 pub enum InlineAsmReg {
     X86(X86InlineAsmReg),
     Arm(ArmInlineAsmReg),
+    Amdgpu(AmdgpuInlineAsmReg),
     AArch64(AArch64InlineAsmReg),
     RiscV(RiscVInlineAsmReg),
     Nvptx(NvptxInlineAsmReg),
@@ -304,24 +310,25 @@ pub enum InlineAsmReg {
 }
 
 impl InlineAsmReg {
-    pub fn name(self) -> &'static str {
+    pub fn name(self) -> Cow<'static, str> {
         match self {
-            Self::X86(r) => r.name(),
-            Self::Arm(r) => r.name(),
-            Self::AArch64(r) => r.name(),
-            Self::RiscV(r) => r.name(),
-            Self::PowerPC(r) => r.name(),
-            Self::Hexagon(r) => r.name(),
-            Self::LoongArch(r) => r.name(),
-            Self::Mips(r) => r.name(),
-            Self::S390x(r) => r.name(),
-            Self::Sparc(r) => r.name(),
-            Self::Bpf(r) => r.name(),
-            Self::Avr(r) => r.name(),
-            Self::Msp430(r) => r.name(),
-            Self::M68k(r) => r.name(),
-            Self::CSKY(r) => r.name(),
-            Self::Err => "<reg>",
+            Self::X86(r) => r.name().into(),
+            Self::Arm(r) => r.name().into(),
+            Self::AArch64(r) => r.name().into(),
+            Self::Amdgpu(r) => r.name().into(),
+            Self::RiscV(r) => r.name().into(),
+            Self::PowerPC(r) => r.name().into(),
+            Self::Hexagon(r) => r.name().into(),
+            Self::LoongArch(r) => r.name().into(),
+            Self::Mips(r) => r.name().into(),
+            Self::S390x(r) => r.name().into(),
+            Self::Sparc(r) => r.name().into(),
+            Self::Bpf(r) => r.name().into(),
+            Self::Avr(r) => r.name().into(),
+            Self::Msp430(r) => r.name().into(),
+            Self::M68k(r) => r.name().into(),
+            Self::CSKY(r) => r.name().into(),
+            Self::Err => "<reg>".into(),
         }
     }
 
@@ -330,6 +337,7 @@ impl InlineAsmReg {
             Self::X86(r) => InlineAsmRegClass::X86(r.reg_class()),
             Self::Arm(r) => InlineAsmRegClass::Arm(r.reg_class()),
             Self::AArch64(r) => InlineAsmRegClass::AArch64(r.reg_class()),
+            Self::Amdgpu(r) => InlineAsmRegClass::Amdgpu(r.reg_class()),
             Self::RiscV(r) => InlineAsmRegClass::RiscV(r.reg_class()),
             Self::PowerPC(r) => InlineAsmRegClass::PowerPC(r.reg_class()),
             Self::Hexagon(r) => InlineAsmRegClass::Hexagon(r.reg_class()),
@@ -356,6 +364,7 @@ impl InlineAsmReg {
             InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => {
                 Self::AArch64(AArch64InlineAsmReg::parse(name)?)
             }
+            InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmReg::parse(name)?),
             InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => {
                 Self::RiscV(RiscVInlineAsmReg::parse(name)?)
             }
@@ -398,6 +407,7 @@ impl InlineAsmReg {
             Self::X86(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
             Self::Arm(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
             Self::AArch64(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
+            Self::Amdgpu(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
             Self::RiscV(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
             Self::PowerPC(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
             Self::Hexagon(r) => r.validate(arch, reloc_model, target_features, target, is_clobber),
@@ -428,6 +438,7 @@ impl InlineAsmReg {
             Self::X86(r) => r.emit(out, arch, modifier),
             Self::Arm(r) => r.emit(out, arch, modifier),
             Self::AArch64(r) => r.emit(out, arch, modifier),
+            Self::Amdgpu(r) => r.emit(out, arch, modifier),
             Self::RiscV(r) => r.emit(out, arch, modifier),
             Self::PowerPC(r) => r.emit(out, arch, modifier),
             Self::Hexagon(r) => r.emit(out, arch, modifier),
@@ -449,6 +460,7 @@ impl InlineAsmReg {
             Self::X86(r) => r.overlapping_regs(|r| cb(Self::X86(r))),
             Self::Arm(r) => r.overlapping_regs(|r| cb(Self::Arm(r))),
             Self::AArch64(_) => cb(self),
+            Self::Amdgpu(_) => cb(self),
             Self::RiscV(_) => cb(self),
             Self::PowerPC(r) => r.overlapping_regs(|r| cb(Self::PowerPC(r))),
             Self::Hexagon(r) => r.overlapping_regs(|r| cb(Self::Hexagon(r))),
@@ -472,6 +484,7 @@ pub enum InlineAsmRegClass {
     X86(X86InlineAsmRegClass),
     Arm(ArmInlineAsmRegClass),
     AArch64(AArch64InlineAsmRegClass),
+    Amdgpu(AmdgpuInlineAsmRegClass),
     RiscV(RiscVInlineAsmRegClass),
     Nvptx(NvptxInlineAsmRegClass),
     PowerPC(PowerPCInlineAsmRegClass),
@@ -497,6 +510,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.name(),
             Self::Arm(r) => r.name(),
             Self::AArch64(r) => r.name(),
+            Self::Amdgpu(r) => r.name(),
             Self::RiscV(r) => r.name(),
             Self::Nvptx(r) => r.name(),
             Self::PowerPC(r) => r.name(),
@@ -524,6 +538,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::X86),
             Self::Arm(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Arm),
             Self::AArch64(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::AArch64),
+            Self::Amdgpu(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Amdgpu),
             Self::RiscV(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::RiscV),
             Self::Nvptx(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::Nvptx),
             Self::PowerPC(r) => r.suggest_class(arch, ty).map(InlineAsmRegClass::PowerPC),
@@ -554,6 +569,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.suggest_modifier(arch, ty),
             Self::Arm(r) => r.suggest_modifier(arch, ty),
             Self::AArch64(r) => r.suggest_modifier(arch, ty),
+            Self::Amdgpu(r) => r.suggest_modifier(arch, ty),
             Self::RiscV(r) => r.suggest_modifier(arch, ty),
             Self::Nvptx(r) => r.suggest_modifier(arch, ty),
             Self::PowerPC(r) => r.suggest_modifier(arch, ty),
@@ -584,6 +600,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.default_modifier(arch),
             Self::Arm(r) => r.default_modifier(arch),
             Self::AArch64(r) => r.default_modifier(arch),
+            Self::Amdgpu(r) => r.default_modifier(arch),
             Self::RiscV(r) => r.default_modifier(arch),
             Self::Nvptx(r) => r.default_modifier(arch),
             Self::PowerPC(r) => r.default_modifier(arch),
@@ -617,6 +634,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.supported_types(arch),
             Self::Arm(r) => r.supported_types(arch),
             Self::AArch64(r) => r.supported_types(arch),
+            Self::Amdgpu(r) => r.supported_types(arch),
             Self::RiscV(r) => r.supported_types(arch),
             Self::Nvptx(r) => r.supported_types(arch),
             Self::PowerPC(r) => r.supported_types(arch),
@@ -645,6 +663,7 @@ impl InlineAsmRegClass {
             InlineAsmArch::AArch64 | InlineAsmArch::Arm64EC => {
                 Self::AArch64(AArch64InlineAsmRegClass::parse(name)?)
             }
+            InlineAsmArch::Amdgpu => Self::Amdgpu(AmdgpuInlineAsmRegClass::parse(name)?),
             InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => {
                 Self::RiscV(RiscVInlineAsmRegClass::parse(name)?)
             }
@@ -682,6 +701,7 @@ impl InlineAsmRegClass {
             Self::X86(r) => r.valid_modifiers(arch),
             Self::Arm(r) => r.valid_modifiers(arch),
             Self::AArch64(r) => r.valid_modifiers(arch),
+            Self::Amdgpu(r) => r.valid_modifiers(arch),
             Self::RiscV(r) => r.valid_modifiers(arch),
             Self::Nvptx(r) => r.valid_modifiers(arch),
             Self::PowerPC(r) => r.valid_modifiers(arch),
@@ -843,6 +863,11 @@ pub fn allocatable_registers(
             aarch64::fill_reg_map(arch, reloc_model, target_features, target, &mut map);
             map
         }
+        InlineAsmArch::Amdgpu => {
+            let mut map = amdgpu::regclass_map();
+            amdgpu::fill_reg_map(arch, reloc_model, target_features, target, &mut map);
+            map
+        }
         InlineAsmArch::RiscV32 | InlineAsmArch::RiscV64 => {
             let mut map = riscv::regclass_map();
             riscv::fill_reg_map(arch, reloc_model, target_features, target, &mut map);
diff --git a/tests/assembly-llvm/asm/amdgpu-types.rs b/tests/assembly-llvm/asm/amdgpu-types.rs
new file mode 100644
index 0000000000000..3728ae0cf7d09
--- /dev/null
+++ b/tests/assembly-llvm/asm/amdgpu-types.rs
@@ -0,0 +1,236 @@
+//@ add-minicore
+//@ revisions: gfx11 gfx12
+//@ assembly-output: emit-asm
+//@ compile-flags: --target amdgcn-amd-amdhsa
+//@[gfx11] compile-flags: -Ctarget-cpu=gfx1100
+//@[gfx12] compile-flags: -Ctarget-cpu=gfx1200
+//@ needs-llvm-components: amdgpu
+//@ needs-rust-lld
+
+#![feature(abi_gpu_kernel, no_core, asm_experimental_arch, f16)]
+#![crate_type = "rlib"]
+#![no_core]
+#![allow(asm_sub_register, non_camel_case_types)]
+
+extern crate minicore;
+use minicore::*;
+
+type ptr = *mut u8;
+
+macro_rules! check {
+    ($func:ident $ty:ident $class:ident $mov:literal) => {
+        #[no_mangle]
+        pub unsafe extern "gpu-kernel" fn $func(x: $ty) -> $ty {
+            let y;
+            asm!(concat!($mov, " {}, {}"), out($class) y, in($class) x);
+            y
+        }
+    };
+
+    ($func:ident $ret_ty:ident $ret_class:ident $($arg_name:ident: $arg_ty:ident $arg_class:ident,)*
+     $mov:literal) => {
+        #[no_mangle]
+        pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) -> $ret_ty {
+            let result;
+            asm!(concat!($mov, " {}", $(", {", stringify!($arg_name), "}",)*),
+                out($ret_class) result, $($arg_name = in($arg_class) $arg_name,)*);
+            result
+        }
+    };
+}
+
+macro_rules! check_reg {
+    ($func:ident $ty:ident $reg:tt $mov:literal) => {
+        #[no_mangle]
+        pub unsafe extern "gpu-kernel" fn $func(x: $ty) -> $ty {
+            let y;
+            asm!(concat!($mov, " ", $reg, ", ", $reg), lateout($reg) y, in($reg) x);
+            y
+        }
+    };
+
+    ($func:ident $ret_ty:ident $ret_reg:tt $($arg_name:ident: $arg_ty:ident $arg_reg:tt,)*
+     $mov:literal) => {
+        #[no_mangle]
+        pub unsafe extern "gpu-kernel" fn $func($($arg_name: $arg_ty,)*) -> $ret_ty {
+            let result;
+            asm!(concat!($mov, " ", $ret_reg, $(", ", $arg_reg,)*), lateout($ret_reg) result,
+                $(in($arg_reg) $arg_name,)*);
+            result
+        }
+    };
+}
+
+// CHECK-LABEL: sgpr_i16:
+// CHECK: #ASMSTART
+// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
+// CHECK: #ASMEND
+check!(sgpr_i16 i32 sgpr x: i16 sgpr, y: i16 sgpr, "s_pack_ll_b32_b16");
+
+// gfx11-LABEL: vgpr_i16:
+// gfx11: #ASMSTART
+// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
+// gfx11: #ASMEND
+#[cfg(gfx11)]
+check!(vgpr_i16 i16 vgpr "v_mov_b16");
+
+// gfx12-LABEL: sgpr_f16:
+// gfx12: #ASMSTART
+// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
+// gfx12: #ASMEND
+#[cfg(gfx12)]
+check!(sgpr_f16 f16 sgpr x: f16 sgpr, y: f16 sgpr, "s_add_f16");
+
+// gfx11-LABEL: vgpr_f16:
+// gfx11: #ASMSTART
+// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
+// gfx11: #ASMEND
+#[cfg(gfx11)]
+check!(vgpr_f16 f16 vgpr "v_mov_b16");
+
+// CHECK-LABEL: sgpr_i32:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check!(sgpr_i32 i32 sgpr "s_mov_b32");
+
+// CHECK-LABEL: vgpr_i32:
+// CHECK: #ASMSTART
+// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}}
+// CHECK: #ASMEND
+check!(vgpr_i32 i32 vgpr "v_mov_b32");
+
+// CHECK-LABEL: sgpr_f32:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check!(sgpr_f32 f32 sgpr "s_mov_b32");
+
+// CHECK-LABEL: vgpr_f32:
+// CHECK: #ASMSTART
+// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}}
+// CHECK: #ASMEND
+check!(vgpr_f32 f32 vgpr "v_mov_b32");
+
+// CHECK-LABEL: sgpr_i64:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check!(sgpr_i64 i64 sgpr "s_mov_b64");
+
+// CHECK-LABEL: vgpr_i64:
+// CHECK: #ASMSTART
+// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check!(vgpr_i64 i64 vgpr x: i32 vgpr, y: i64 vgpr, "v_lshlrev_b64");
+
+// CHECK-LABEL: sgpr_f64:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check!(sgpr_f64 f64 sgpr "s_mov_b64");
+
+// CHECK-LABEL: vgpr_f64:
+// CHECK: #ASMSTART
+// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check!(vgpr_f64 f64 vgpr x: f64 vgpr, y: f64 vgpr, "v_add_f64");
+
+// CHECK-LABEL: sgpr_i128:
+// CHECK: #ASMSTART
+// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check!(sgpr_i128 i128 sgpr x: ptr sgpr, y: i32 sgpr, "s_load_b128");
+
+// CHECK-LABEL: vgpr_i128:
+// CHECK: #ASMSTART
+// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check!(vgpr_i128 i128 vgpr x: i32 vgpr, y: ptr sgpr, "global_load_b128");
+
+// CHECK-LABEL: s0_i16:
+// CHECK: #ASMSTART
+// CHECK: s_pack_ll_b32_b16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
+// CHECK: #ASMEND
+check_reg!(s0_i16 i32 "s0" x: i16 "s1", y: i16 "s2", "s_pack_ll_b32_b16");
+
+// gfx11-LABEL: v0_i16:
+// gfx11: #ASMSTART
+// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
+// gfx11: #ASMEND
+#[cfg(gfx11)]
+check_reg!(v0_i16 i16 "v0.l" "v_mov_b16");
+
+// gfx12-LABEL: s0_f16:
+// gfx12: #ASMSTART
+// gfx12: s_add_f16 s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}, s{{[a-z0-9.]+}}
+// gfx12: #ASMEND
+#[cfg(gfx12)]
+check_reg!(s0_f16 f16 "s0" x: f16 "s1", y: f16 "s2", "s_add_f16");
+
+// gfx11-LABEL: v0_f16:
+// gfx11: #ASMSTART
+// gfx11: v_mov_b16 v{{[a-z0-9.]+}}, v{{[a-z0-9.]+}}
+// gfx11: #ASMEND
+#[cfg(gfx11)]
+check_reg!(v0_f16 f16 "v0.l" "v_mov_b16");
+
+// CHECK-LABEL: s0_i32:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check_reg!(s0_i32 i32 "s0" "s_mov_b32");
+
+// CHECK-LABEL: v0_i32:
+// CHECK: #ASMSTART
+// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}}
+// CHECK: #ASMEND
+check_reg!(v0_i32 i32 "v0" "v_mov_b32");
+
+// CHECK-LABEL: s0_f32:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check_reg!(s0_f32 f32 "s0" "s_mov_b32");
+
+// CHECK-LABEL: v0_f32:
+// CHECK: #ASMSTART
+// CHECK: v_mov_b32 v{{[0-9]+}}, v{{[0-9]+}}
+// CHECK: #ASMEND
+check_reg!(v0_f32 f32 "v0" "v_mov_b32");
+
+// CHECK-LABEL: s0_i64:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check_reg!(s0_i64 i64 "s[0:1]" "s_mov_b64");
+
+// CHECK-LABEL: v0_i64:
+// CHECK: #ASMSTART
+// CHECK: v_lshlrev_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check_reg!(v0_i64 i64 "v[0:1]" x: i32 "v0", y: i64 "v[0:1]", "v_lshlrev_b64");
+
+// CHECK-LABEL: s0_f64:
+// CHECK: #ASMSTART
+// CHECK: s_mov_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check_reg!(s0_f64 f64 "s[0:1]" "s_mov_b64");
+
+// CHECK-LABEL: v0_f64:
+// CHECK: #ASMSTART
+// CHECK: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check_reg!(v0_f64 f64 "v[0:1]" x: f64 "v[0:1]", y: f64 "v[2:3]", "v_add_f64");
+
+// CHECK-LABEL: s0_i128:
+// CHECK: #ASMSTART
+// CHECK: s_load_b128 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+// CHECK: #ASMEND
+check_reg!(s0_i128 i128 "s[0:3]" x: ptr "s[0:1]", y: i32 "s0", "s_load_b128");
+
+// CHECK-LABEL: v0_i128:
+// CHECK: #ASMSTART
+// CHECK: global_load_b128 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
+// CHECK: #ASMEND
+check_reg!(v0_i128 i128 "v[0:3]" x: i32 "v0", y: ptr "s[0:1]", "global_load_b128");
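Usage sketch (reviewer note, not part of the patch): a minimal example of what this change enables, assuming a nightly compiler with this diff applied and the tier-3 `amdgcn-amd-amdhsa` target (typically built with `-Zbuild-std`, hence plain `#![no_std]` here rather than the `no_core`/minicore setup the test uses). Function names are illustrative; the instructions, register classes, and explicit-register syntax mirror the test file above.

```rust
#![no_std]
#![feature(asm_experimental_arch)]

use core::arch::asm;

/// Let the compiler allocate registers: `vgpr` is one of the two new
/// register classes (lowered to LLVM's "v" constraint via `prefix()`).
pub unsafe fn vgpr_copy(x: i32) -> i32 {
    let y;
    asm!("v_mov_b32 {}, {}", out(vgpr) y, in(vgpr) x);
    y
}

/// Pin operands to explicit registers. `AmdgpuInlineAsmReg::parse`
/// accepts single registers ("s0"), inclusive ranges for wider values
/// ("s[0:1]"), and 16-bit halves ("v0.l"/"v0.h"). The concrete register
/// numbers here are arbitrary and assumed not otherwise reserved.
pub unsafe fn sgpr_pair_copy(x: i64) -> i64 {
    let y;
    asm!("s_mov_b64 s[2:3], s[0:1]", lateout("s[2:3]") y, in("s[0:1]") x);
    y
}
```

Per `max_num`, the allocatable registers on this target are s0–s105 and v0–v255; `overlapping_regs` is intentionally a no-op, so conflicts between explicit registers are not diagnosed.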