diff --git a/Cargo.lock b/Cargo.lock index 95124ac6fe..d3123caaa4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -121,6 +121,26 @@ dependencies = [ "serde", ] +[[package]] +name = "capstone" +version = "0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "015ef5d5ca1743e3f94af9509ba6bd2886523cfee46e48d15c2ef5216fd4ac9a" +dependencies = [ + "capstone-sys", + "libc", +] + +[[package]] +name = "capstone-sys" +version = "0.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2267cb8d16a1e4197863ec4284ffd1aec26fe7e57c58af46b02590a0235809a0" +dependencies = [ + "cc", + "libc", +] + [[package]] name = "cargo-platform" version = "0.1.9" @@ -591,6 +611,7 @@ version = "0.1.0" dependencies = [ "aes", "bitflags", + "capstone", "chrono", "chrono-tz", "colored", diff --git a/Cargo.toml b/Cargo.toml index 76018efc4a..0b4b8e26bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,7 @@ libloading = "0.8" nix = { version = "0.30.1", features = ["mman", "ptrace", "signal"] } ipc-channel = "0.19.0" serde = { version = "1.0.219", features = ["derive"] } +capstone = "0.13" [dev-dependencies] ui_test = "0.29.1" diff --git a/src/alloc/isolated_alloc.rs b/src/alloc/isolated_alloc.rs index 123597ed88..1f5f7dd5df 100644 --- a/src/alloc/isolated_alloc.rs +++ b/src/alloc/isolated_alloc.rs @@ -1,5 +1,6 @@ use std::alloc::{self, Layout}; +use nix::sys::mman; use rustc_index::bit_set::DenseBitSet; /// How many bytes of memory each bit in the bitset represents. @@ -271,13 +272,54 @@ impl IsolatedAlloc { pub fn pages(&self) -> Vec { let mut pages: Vec<_> = self.page_ptrs.clone().into_iter().map(|p| p.expose_provenance()).collect(); - for (ptr, size) in &self.huge_ptrs { + self.huge_ptrs.iter().for_each(|(ptr, size)| { for i in 0..size / self.page_size { pages.push(ptr.expose_provenance().strict_add(i * self.page_size)); } - } + }); pages } + + /// Protects all owned memory as `PROT_NONE`, preventing accesses. + /// + /// SAFETY: Accessing memory after this point will result in a segfault + /// unless it is first unprotected. + pub unsafe fn prepare_ffi(&mut self) -> Result<(), nix::errno::Errno> { + let prot = mman::ProtFlags::PROT_NONE; + unsafe { self.mprotect(prot) } + } + + /// Deprotects all owned memory by setting it to RW. Erroring here is very + /// likely unrecoverable, so it may panic if applying those permissions + /// fails. + pub fn unprep_ffi(&mut self) { + let prot = mman::ProtFlags::PROT_READ | mman::ProtFlags::PROT_WRITE; + unsafe { + self.mprotect(prot).unwrap(); + } + } + + /// Applies `prot` to every page managed by the allocator. + /// + /// SAFETY: Accessing memory in violation of the protection flags will + /// trigger a segfault. 
+ unsafe fn mprotect(&mut self, prot: mman::ProtFlags) -> Result<(), nix::errno::Errno> { + for &pg in &self.page_ptrs { + unsafe { + // We already know only non-null ptrs are pushed to self.pages + let addr: std::ptr::NonNull<libc::c_void> = + std::ptr::NonNull::new_unchecked(pg.cast()); + mman::mprotect(addr, self.page_size, prot)?; + } + } + for &(hpg, size) in &self.huge_ptrs { + unsafe { + let addr = std::ptr::NonNull::new_unchecked(hpg.cast()); + mman::mprotect(addr, size.next_multiple_of(self.page_size), prot)?; + } + } + Ok(()) + } } #[cfg(test)] diff --git a/src/alloc_addresses/mod.rs b/src/alloc_addresses/mod.rs index 4a038fe648..2abb2d91ab 100644 --- a/src/alloc_addresses/mod.rs +++ b/src/alloc_addresses/mod.rs @@ -470,13 +470,46 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { /// This overapproximates the modifications which external code might make to memory: /// We set all reachable allocations as initialized, mark all reachable provenances as exposed /// and overwrite them with `Provenance::WILDCARD`. - fn prepare_exposed_for_native_call(&mut self) -> InterpResult<'tcx> { + fn prepare_exposed_for_native_call(&mut self, _paranoid: bool) -> InterpResult<'tcx> { let this = self.eval_context_mut(); // We need to make a deep copy of this list, but it's fine; it also serves as scratch space // for the search within `prepare_for_native_call`. let exposed: Vec<AllocId> = this.machine.alloc_addresses.get_mut().exposed.iter().copied().collect(); - this.prepare_for_native_call(exposed) + this.prepare_for_native_call(exposed /*, paranoid*/) + } + + /// Makes use of information obtained about memory accesses during FFI to determine which + /// provenances should be exposed. Note that if `prepare_exposed_for_native_call` was not + /// called before the FFI (with `paranoid` set to false) then some of the writes may be + /// lost! + #[cfg(target_os = "linux")] + fn apply_events( + &mut self, + events: crate::shims::trace::messages::MemEvents, + ) -> InterpResult<'tcx> { + let this = self.eval_context_mut(); + + let mut reads = vec![]; + let mut writes = vec![]; + for acc in events.acc_events { + match acc { + // Ideally, we'd skip reads that occur after certain bytes were + // already written to. However, these are always just conservative + // overestimates - Read(range) means "a read maybe happened + // spanning at most range" - so we can't make use of this for + // now. Maybe we could also skip over reads/writes that hit the + // same bytes, but that's best added together with the stuff above. + shims::trace::AccessEvent::Read(range) => reads.push(range), + shims::trace::AccessEvent::Write(range) => { + writes.push(range); + } + } + } + let _exposed: Vec<AllocId> = + this.machine.alloc_addresses.get_mut().exposed.iter().copied().collect(); + interp_ok(()) + //this.apply_accesses(exposed, events.reads, events.writes) + } } diff --git a/src/shims/native_lib.rs b/src/shims/native_lib.rs index 53d1de060f..9d238146d8 100644 --- a/src/shims/native_lib.rs +++ b/src/shims/native_lib.rs @@ -219,8 +219,15 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { } } - // Prepare all exposed memory. - this.prepare_exposed_for_native_call()?; + // Prepare all exposed memory, depending on whether we have a supervisor process. 
+ #[cfg(target_os = "linux")] + if super::trace::Supervisor::poll() { + this.prepare_exposed_for_native_call(false)?; + } else { + this.prepare_exposed_for_native_call(true)?; + } + #[cfg(not(target_os = "linux"))] + this.prepare_exposed_for_native_call(true)?; // Convert them to `libffi::high::Arg` type. let libffi_args = libffi_args @@ -229,7 +236,15 @@ pub trait EvalContextExt<'tcx>: crate::MiriInterpCxExt<'tcx> { .collect::>>(); // Call the function and store output, depending on return type in the function signature. - let (ret, _) = this.call_native_with_args(link_name, dest, code_ptr, libffi_args)?; + let (ret, maybe_memevents) = + this.call_native_with_args(link_name, dest, code_ptr, libffi_args)?; + + #[cfg(target_os = "linux")] + if let Some(events) = maybe_memevents { + this.apply_events(events)?; + } + #[cfg(not(target_os = "linux"))] + let _ = maybe_memevents; // Suppress the unused warning. this.write_immediate(*ret, dest)?; interp_ok(true) @@ -250,15 +265,15 @@ unsafe fn do_native_call( unsafe { if let Some(alloc) = alloc { - // SAFETY: We don't touch the machine memory past this point + // SAFETY: We don't touch the machine memory past this point. let (guard, stack_ptr) = Supervisor::start_ffi(alloc.clone()); - // SAFETY: Upheld by caller + // SAFETY: Upheld by caller. let ret = ffi::call(ptr, args); // SAFETY: We got the guard and stack pointer from start_ffi, and - // the allocator is the same + // the allocator is the same. (ret, Supervisor::end_ffi(guard, alloc, stack_ptr)) } else { - // SAFETY: Upheld by caller + // SAFETY: Upheld by caller. (ffi::call(ptr, args), None) } } diff --git a/src/shims/trace/child.rs b/src/shims/trace/child.rs index dcfdaad748..d9227273a1 100644 --- a/src/shims/trace/child.rs +++ b/src/shims/trace/child.rs @@ -52,30 +52,40 @@ impl Supervisor { // If the supervisor is not initialised for whatever reason, fast-fail. // This might be desired behaviour, as even on platforms where ptracing // is not implemented it enables us to enforce that only one FFI call - // happens at a time + // happens at a time. let Some(sv) = sv_guard.take() else { return (sv_guard, None); }; // Get pointers to all the pages the supervisor must allow accesses in - // and prepare the fake stack + // and prepare the fake stack. let page_ptrs = alloc.borrow().pages(); let raw_stack_ptr: *mut [u8; FAKE_STACK_SIZE] = Box::leak(Box::new([0u8; FAKE_STACK_SIZE])).as_mut_ptr().cast(); let stack_ptr = raw_stack_ptr.expose_provenance(); let start_info = StartFfiInfo { page_ptrs, stack_ptr }; + // SAFETY: We do not access machine memory past this point until the + // supervisor is ready to allow it. + unsafe { + if alloc.borrow_mut().prepare_ffi().is_err() { + // Don't mess up unwinding by maybe leaving the memory partly protected + alloc.borrow_mut().unprep_ffi(); + panic!("Cannot protect memory for FFI call!"); + } + } + // Send over the info. // NB: if we do not wait to receive a blank confirmation response, it is // possible that the supervisor is alerted of the SIGSTOP *before* it has // actually received the start_info, thus deadlocking! This way, we can - // enforce an ordering for these events + // enforce an ordering for these events. sv.message_tx.send(TraceRequest::StartFfi(start_info)).unwrap(); sv.confirm_rx.recv().unwrap(); *sv_guard = Some(sv); // We need to be stopped for the supervisor to be able to make certain // modifications to our memory - simply waiting on the recv() doesn't - // count + // count. 
signal::raise(signal::SIGSTOP).unwrap(); (sv_guard, Some(raw_stack_ptr)) } @@ -90,7 +100,7 @@ impl Supervisor { /// one passed to it also. pub unsafe fn end_ffi( mut sv_guard: std::sync::MutexGuard<'static, Option>, - _alloc: Rc>, + alloc: Rc>, raw_stack_ptr: Option<*mut [u8; FAKE_STACK_SIZE]>, ) -> Option { // We can't use IPC channels here to signal that FFI mode has ended, @@ -99,19 +109,22 @@ impl Supervisor { // simpler and more robust to simply use the signals which are left for // arbitrary usage. Since this will block until we are continued by the // supervisor, we can assume past this point that everything is back to - // normal + // normal. signal::raise(signal::SIGUSR1).unwrap(); + // This is safe! It just sets memory to normal expected permissions. + alloc.borrow_mut().unprep_ffi(); + // If this is `None`, then `raw_stack_ptr` is None and does not need to // be deallocated (and there's no need to worry about the guard, since - // it contains nothing) + // it contains nothing). let sv = sv_guard.take()?; // SAFETY: Caller upholds that this pointer was allocated as a box with - // this type + // this type. unsafe { drop(Box::from_raw(raw_stack_ptr.unwrap())); } - // On the off-chance something really weird happens, don't block forever + // On the off-chance something really weird happens, don't block forever. let ret = sv .event_rx .try_recv_timeout(std::time::Duration::from_secs(5)) @@ -138,71 +151,74 @@ impl Supervisor { /// The invariants for `fork()` must be upheld by the caller. pub unsafe fn init_sv() -> Result<(), SvInitError> { // FIXME: Much of this could be reimplemented via the mitosis crate if we upstream the - // relevant missing bits + // relevant missing bits. // On Linux, this will check whether ptrace is fully disabled by the Yama module. // If Yama isn't running or we're not on Linux, we'll still error later, but - // this saves a very expensive fork call + // this saves a very expensive fork call. let ptrace_status = std::fs::read_to_string("/proc/sys/kernel/yama/ptrace_scope"); if let Ok(stat) = ptrace_status { if let Some(stat) = stat.chars().next() { - // Fast-error if ptrace is fully disabled on the system + // Fast-error if ptrace is fully disabled on the system. if stat == '3' { return Err(SvInitError); } } } - // Initialise the supervisor if it isn't already, placing it into SUPERVISOR + // Initialise the supervisor if it isn't already, placing it into SUPERVISOR. let mut lock = SUPERVISOR.lock().unwrap(); if lock.is_some() { return Ok(()); } - // Prepare the IPC channels we need + // Prepare the IPC channels we need. let (message_tx, message_rx) = ipc::channel().unwrap(); let (confirm_tx, confirm_rx) = ipc::channel().unwrap(); let (event_tx, event_rx) = ipc::channel().unwrap(); - // SAFETY: Calling sysconf(_SC_PAGESIZE) is always safe and cannot error + // SAFETY: Calling sysconf(_SC_PAGESIZE) is always safe and cannot error. let page_size = unsafe { libc::sysconf(libc::_SC_PAGESIZE) }.try_into().unwrap(); + // Set up the pagesize used in the memory protection functions. + // SAFETY: sysconf(_SC_PAGESIZE) is always safe and doesn't error + super::parent::PAGE_SIZE.store(page_size, std::sync::atomic::Ordering::Relaxed); + unsafe { // TODO: Maybe use clone3() instead for better signalling of when the child exits? // SAFETY: Caller upholds that only one thread exists. 
match unistd::fork().unwrap() { unistd::ForkResult::Parent { child } => { // If somehow another thread does exist, prevent it from accessing the lock - // and thus breaking our safety invariants + // and thus breaking our safety invariants. std::mem::forget(lock); // The child process is free to unwind, so we won't to avoid doubly freeing - // system resources + // system resources. let init = std::panic::catch_unwind(|| { let listener = ChildListener { message_rx, attached: false, override_retcode: None }; - // Trace as many things as possible, to be able to handle them as needed + // Trace as many things as possible, to be able to handle them as needed. let options = ptrace::Options::PTRACE_O_TRACESYSGOOD | ptrace::Options::PTRACE_O_TRACECLONE | ptrace::Options::PTRACE_O_TRACEFORK; - // Attach to the child process without stopping it + // Attach to the child process without stopping it. match ptrace::seize(child, options) { // Ptrace works :D Ok(_) => { - let code = sv_loop(listener, child, event_tx, confirm_tx, page_size) - .unwrap_err(); + let code = sv_loop(listener, child, event_tx, confirm_tx).unwrap_err(); // If a return code of 0 is not explicitly given, assume something went - // wrong and return 1 + // wrong and return 1. std::process::exit(code.unwrap_or(1)) } - // Ptrace does not work and we failed to catch that + // Ptrace does not work and we failed to catch that. Err(_) => { - // If we can't ptrace, Miri continues being the parent + // If we can't ptrace, Miri continues being the parent. signal::kill(child, signal::SIGKILL).unwrap(); SvInitError } } }); match init { - // The "Ok" case means that we couldn't ptrace + // The "Ok" case means that we couldn't ptrace. Ok(e) => return Err(e), Err(p) => { eprintln!("Supervisor process panicked!\n{p:?}"); @@ -212,12 +228,12 @@ pub unsafe fn init_sv() -> Result<(), SvInitError> { } unistd::ForkResult::Child => { // Make sure we never get orphaned and stuck in SIGSTOP or similar - // SAFETY: prctl PR_SET_PDEATHSIG is always safe to call + // SAFETY: prctl PR_SET_PDEATHSIG is always safe to call. let ret = libc::prctl(libc::PR_SET_PDEATHSIG, libc::SIGTERM); assert_eq!(ret, 0); // First make sure the parent succeeded with ptracing us! signal::raise(signal::SIGSTOP).unwrap(); - // If we're the child process, save the supervisor info + // If we're the child process, save the supervisor info. *lock = Some(Supervisor { message_tx, confirm_rx, event_rx }); } } diff --git a/src/shims/trace/parent.rs b/src/shims/trace/parent.rs index 5eb8aa8465..fd2c7c037d 100644 --- a/src/shims/trace/parent.rs +++ b/src/shims/trace/parent.rs @@ -1,16 +1,108 @@ +use std::sync::atomic::{AtomicPtr, AtomicUsize}; + use ipc_channel::ipc; use nix::sys::{ptrace, signal, wait}; use nix::unistd; -use super::StartFfiInfo; -use super::messages::{Confirmation, MemEvents, TraceRequest}; +use crate::shims::trace::messages::{Confirmation, MemEvents, TraceRequest}; +use crate::shims::trace::{AccessEvent, FAKE_STACK_SIZE, StartFfiInfo}; /// The flags to use when calling `waitid()`. -/// Since bitwise OR on the nix version of these flags is implemented as a trait, -/// we can't use them directly so we do it this way +/// Since bitwise or on the nix version of these flags is implemented as a trait, +/// this cannot be const directly so we do it this way. const WAIT_FLAGS: wait::WaitPidFlag = wait::WaitPidFlag::from_bits_truncate(libc::WUNTRACED | libc::WEXITED); +/// Arch-specific maximum size a single access might perform. 
x86 value is set +/// assuming nothing bigger than AVX-512 is available. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 64; +/// The largest arm64 simd instruction operates on 16 bytes. +#[cfg(any(target_arch = "arm", target_arch = "aarch64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 16; +/// The max riscv vector instruction can access 8 consecutive 32-bit values. +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +const ARCH_MAX_ACCESS_SIZE: usize = 32; + +/// The default word size on a given platform, in bytes. +#[cfg(any(target_arch = "x86", target_arch = "arm", target_arch = "riscv32"))] +const ARCH_WORD_SIZE: usize = 4; +#[cfg(any(target_arch = "x86_64", target_arch = "aarch64", target_arch = "riscv64"))] +const ARCH_WORD_SIZE: usize = 8; + +/// The address of the page set to be edited, initialised to a sentinel null +/// pointer. +static PAGE_ADDR: AtomicPtr = AtomicPtr::new(std::ptr::null_mut()); +/// The host pagesize, initialised to a sentinel zero value. +pub static PAGE_SIZE: AtomicUsize = AtomicUsize::new(0); +/// How many consecutive pages to unprotect. 1 by default, unlikely to be set +/// higher than 2. +static PAGE_COUNT: AtomicUsize = AtomicUsize::new(1); + +/// Allows us to get common arguments from the `user_regs_t` across architectures. +/// Normally this would land us ABI hell, but thankfully all of our usecases +/// consist of functions with a small number of register-sized integer arguments. +/// See for sources. +trait ArchIndependentRegs { + /// Gets the address of the instruction pointer. + fn ip(&self) -> usize; + /// Set the instruction pointer; remember to also set the stack pointer, or + /// else the stack might get messed up! + fn set_ip(&mut self, ip: usize); + /// Set the stack pointer, ideally to a zeroed-out area. + fn set_sp(&mut self, sp: usize); +} + +// It's fine / desirable behaviour for values to wrap here, we care about just +// preserving the bit pattern. +#[cfg(target_arch = "x86_64")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.rip as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.rip = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.rsp = sp as _ } +} + +#[cfg(target_arch = "x86")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.eip as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.eip = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.esp = sp as _ } +} + +#[cfg(target_arch = "aarch64")] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.pc as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.pc = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.sp = sp as _ } +} + +#[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] +#[expect(clippy::as_conversions)] +#[rustfmt::skip] +impl ArchIndependentRegs for libc::user_regs_struct { + #[inline] + fn ip(&self) -> usize { self.pc as _ } + #[inline] + fn set_ip(&mut self, ip: usize) { self.pc = ip as _ } + #[inline] + fn set_sp(&mut self, sp: usize) { self.sp = sp as _ } +} + /// A unified event representing something happening on the child process. Wraps /// `nix`'s `WaitStatus` and our custom signals so it can all be done with one /// `match` statement. 
@@ -22,7 +114,7 @@ pub enum ExecEvent { End, /// The child process with the specified pid was stopped by the given signal. Status(unistd::Pid, signal::Signal), - /// The child process with the specified pid entered or exited a syscall. + /// The child process with the specified pid entered or exited a syscall. Syscall(unistd::Pid), /// A child process exited or was killed; if we have a return code, it is /// specified. @@ -45,7 +137,7 @@ impl Iterator for ChildListener { // Allows us to monitor the child process by just iterating over the listener // NB: This should never return None! fn next(&mut self) -> Option<Self::Item> { - // Do not block if the child has nothing to report for `waitid` + // Do not block if the child has nothing to report for `waitid`. let opts = WAIT_FLAGS | wait::WaitPidFlag::WNOHANG; loop { // Listen to any child, not just the main one. Important if we want @@ -55,17 +147,17 @@ impl Iterator for ChildListener { match wait::waitid(wait::Id::All, opts) { Ok(stat) => match stat { - // Child exited normally with a specific code set + // Child exited normally with a specific code set. wait::WaitStatus::Exited(_, code) => { let code = self.override_retcode.unwrap_or(code); return Some(ExecEvent::Died(Some(code))); } - // Child was killed by a signal, without giving a code + // Child was killed by a signal, without giving a code. wait::WaitStatus::Signaled(_, _, _) => return Some(ExecEvent::Died(self.override_retcode)), // Child entered a syscall. Since we're always technically // tracing, only pass this along if we're actively - // monitoring the child + // monitoring the child. wait::WaitStatus::PtraceSyscall(pid) => if self.attached { return Some(ExecEvent::Syscall(pid)); @@ -84,11 +176,11 @@ impl Iterator for ChildListener { return Some(ExecEvent::Status(pid, signal)); } } else { - // Just pass along the signal + // Just pass along the signal. ptrace::cont(pid, signal).unwrap(); }, // Child was stopped at the given signal. Same logic as for - // WaitStatus::PtraceEvent + // WaitStatus::PtraceEvent. wait::WaitStatus::Stopped(pid, signal) => if self.attached { if signal == signal::SIGUSR1 { @@ -104,11 +196,11 @@ impl Iterator for ChildListener { }, // This case should only trigger if all children died and we // somehow missed that, but it's best we not allow any room - // for deadlocks + // for deadlocks. Err(_) => return Some(ExecEvent::Died(None)), } - // Similarly, do a non-blocking poll of the IPC channel + // Similarly, do a non-blocking poll of the IPC channel. if let Ok(req) = self.message_rx.try_recv() { match req { TraceRequest::StartFfi(info) => @@ -123,7 +215,7 @@ impl Iterator for ChildListener { } } - // Not ideal, but doing anything else might sacrifice performance + // Not ideal, but doing anything else might sacrifice performance. std::thread::yield_now(); } } @@ -134,6 +226,8 @@ impl Iterator for ChildListener { enum ExecError { /// The child process died with this return code, if we have one. Died(Option<i32>), + /// Something errored, but we should ignore it and proceed. + Shrug, } /// This is the main loop of the supervisor process. It runs in a separate @@ -144,35 +238,41 @@ pub fn sv_loop( init_pid: unistd::Pid, event_tx: ipc::IpcSender<MemEvents>, confirm_tx: ipc::IpcSender<Confirmation>, - _page_size: usize, ) -> Result<!, Option<i32>> { - // Things that we return to the child process + let page_size = PAGE_SIZE.load(std::sync::atomic::Ordering::Relaxed); + assert_ne!(page_size, 0); + + // Things that we return to the child process. 
let mut acc_events = Vec::new(); - // Memory allocated on the MiriMachine - let mut _ch_pages = Vec::new(); - let mut _ch_stack = None; + // Memory allocated for the MiriMachine. + let mut ch_pages = Vec::new(); + let mut ch_stack = None; + + // An instance of the Capstone disassembler, so we don't spawn one on every access. + let cs = get_disasm(); // The pid of the last process we interacted with, used by default if we don't have a - // reason to use a different one + // reason to use a different one. let mut curr_pid = init_pid; - // There's an initial sigstop we need to deal with + // There's an initial sigstop we need to deal with. wait_for_signal(Some(curr_pid), signal::SIGSTOP, false).map_err(|e| { match e { ExecError::Died(code) => code, + ExecError::Shrug => None, } })?; ptrace::cont(curr_pid, None).unwrap(); for evt in listener { match evt { - // start_ffi was called by the child, so prep memory + // start_ffi was called by the child, so prep memory. ExecEvent::Start(ch_info) => { - // All the pages that the child process is "allowed to" access - _ch_pages = ch_info.page_ptrs; - // And the fake stack it allocated for us to use later - _ch_stack = Some(ch_info.stack_ptr); + // All the pages that the child process is "allowed to" access. + ch_pages = ch_info.page_ptrs; + // And the fake stack it allocated for us to use later. + ch_stack = Some(ch_info.stack_ptr); // We received the signal and are no longer in the main listener loop, // so we can let the child move on to the end of start_ffi where it will @@ -180,34 +280,57 @@ pub fn sv_loop( // order to do most ptrace operations! confirm_tx.send(Confirmation).unwrap(); // We can't trust simply calling `Pid::this()` in the child process to give the right - // PID for us, so we get it this way + // PID for us, so we get it this way. curr_pid = wait_for_signal(None, signal::SIGSTOP, false).unwrap(); ptrace::syscall(curr_pid, None).unwrap(); } - // end_ffi was called by the child + // end_ffi was called by the child. ExecEvent::End => { - // Hand over the access info we traced + // Hand over the access info we traced. event_tx.send(MemEvents { acc_events }).unwrap(); - // And reset our values + // And reset our values. acc_events = Vec::new(); - _ch_stack = None; + ch_pages = Vec::new(); + ch_stack = None; - // No need to monitor syscalls anymore, they'd just be ignored + // No need to monitor syscalls anymore, they'd just be ignored. ptrace::cont(curr_pid, None).unwrap(); } // Child process was stopped by a signal - ExecEvent::Status(pid, signal) => { - eprintln!("Process unexpectedly got {signal}; continuing..."); - // In case we're not tracing - if ptrace::syscall(pid, signal).is_err() { - // If *this* fails too, something really weird happened - // and it's probably best to just panic - signal::kill(pid, signal::SIGCONT).unwrap(); - } - } + ExecEvent::Status(pid, signal) => + match signal { + // If it was a segfault, check if it was an artificial one + // caused by it trying to access the MiriMachine memory. + signal::SIGSEGV => + match handle_segfault( + pid, + &ch_pages, + ch_stack.unwrap(), + page_size, + &cs, + &mut acc_events, + ) { + Err(e) => + match e { + ExecError::Died(code) => return Err(code), + ExecError::Shrug => continue, + }, + _ => (), + }, + // Something weird happened. + _ => { + eprintln!("Process unexpectedly got {signal}; continuing..."); + // In case we're not tracing + if ptrace::syscall(pid, None).is_err() { + // If *this* fails too, something really weird happened + // and it's probably best to just panic. 
+ signal::kill(pid, signal::SIGCONT).unwrap(); + } + } + }, // Child entered a syscall; we wait for exits inside of this, so it - // should never trigger on return from a syscall we care about + // should never trigger on return from a syscall we care about. ExecEvent::Syscall(pid) => { ptrace::syscall(pid, None).unwrap(); } @@ -220,6 +343,30 @@ unreachable!() } +/// Spawns a Capstone disassembler for the host architecture. +#[rustfmt::skip] +fn get_disasm() -> capstone::Capstone { + use capstone::prelude::*; + let cs_pre = Capstone::new(); + { + #[cfg(target_arch = "x86_64")] + {cs_pre.x86().mode(arch::x86::ArchMode::Mode64)} + #[cfg(target_arch = "x86")] + {cs_pre.x86().mode(arch::x86::ArchMode::Mode32)} + #[cfg(target_arch = "aarch64")] + {cs_pre.arm64().mode(arch::arm64::ArchMode::Arm)} + #[cfg(target_arch = "arm")] + {cs_pre.arm().mode(arch::arm::ArchMode::Arm)} + #[cfg(target_arch = "riscv64")] + {cs_pre.riscv().mode(arch::riscv::ArchMode::RiscV64)} + #[cfg(target_arch = "riscv32")] + {cs_pre.riscv().mode(arch::riscv::ArchMode::RiscV32)} + } + .detail(true) + .build() + .unwrap() +} + /// Waits for `wait_signal`. If `init_cont`, it will first do a `ptrace::cont`. /// We want to avoid that in some cases, like at the beginning of FFI. /// @@ -232,7 +379,7 @@ fn wait_for_signal( if init_cont { ptrace::cont(pid.unwrap(), None).unwrap(); } - // Repeatedly call `waitid` until we get the signal we want, or the process dies + // Repeatedly call `waitid` until we get the signal we want, or the process dies. loop { let wait_id = match pid { Some(pid) => wait::Id::Pid(pid), @@ -240,7 +387,7 @@ fn wait_for_signal( }; let stat = wait::waitid(wait_id, WAIT_FLAGS).map_err(|_| ExecError::Died(None))?; let (signal, pid) = match stat { - // Report the cause of death, if we know it + // Report the cause of death, if we know it. wait::WaitStatus::Exited(_, code) => { return Err(ExecError::Died(Some(code))); } @@ -248,7 +395,7 @@ fn wait_for_signal( wait::WaitStatus::Stopped(pid, signal) => (signal, pid), wait::WaitStatus::PtraceEvent(pid, signal, _) => (signal, pid), // This covers PtraceSyscall and variants that are impossible with - // the flags set (e.g. WaitStatus::StillAlive) + // the flags set (e.g. WaitStatus::StillAlive). _ => { ptrace::cont(pid.unwrap(), None).unwrap(); continue; @@ -257,7 +404,303 @@ fn wait_for_signal( if signal == wait_signal { return Ok(pid); } else { - ptrace::cont(pid, None).map_err(|_| ExecError::Died(None))?; + ptrace::cont(pid, signal).map_err(|_| ExecError::Died(None))?; + } + } +} + +/// Grabs the access that caused a segfault and logs it down if it's to our memory, +/// or kills the child and returns the appropriate error otherwise. +fn handle_segfault( + pid: unistd::Pid, + ch_pages: &[usize], + ch_stack: usize, + page_size: usize, + cs: &capstone::Capstone, + acc_events: &mut Vec<AccessEvent>, +) -> Result<(), ExecError> { + /// This is just here to not pollute the main namespace with `capstone::prelude::*`. + #[inline] + fn capstone_disassemble( + instr: &[u8], + addr: usize, + cs: &capstone::Capstone, + acc_events: &mut Vec<AccessEvent>, + ) -> capstone::CsResult<()> { + use capstone::prelude::*; + + // The arch_detail is what we care about, but it relies on these temporaries + // that we can't drop. 0x1000 is the default base address for Capstone, and + // we're expecting 1 instruction. 
+ let insns = cs.disasm_count(instr, 0x1000, 1)?; + let ins_detail = cs.insn_detail(&insns[0])?; + let arch_detail = ins_detail.arch_detail(); + + for op in arch_detail.operands() { + match op { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + arch::ArchOperand::X86Operand(x86_operand) => { + match x86_operand.op_type { + // We only care about memory accesses + arch::x86::X86OperandType::Mem(_) => { + let push = addr..addr.strict_add(usize::from(x86_operand.size)); + // It's called a "RegAccessType" but it also applies to memory + let acc_ty = x86_operand.access.unwrap(); + if acc_ty.is_readable() { + acc_events.push(AccessEvent::Read(push.clone())); + } + if acc_ty.is_writable() { + acc_events.push(AccessEvent::Write(push)); + } + } + _ => (), + } + } + #[cfg(target_arch = "aarch64")] + arch::ArchOperand::Arm64Operand(arm64_operand) => { + // Annoyingly, we don't always get the size here, so just be pessimistic for now. + match arm64_operand.op_type { + arch::arm64::Arm64OperandType::Mem(_) => { + // B = 1 byte, H = 2 bytes, S = 4 bytes, D = 8 bytes, Q = 16 bytes. + let size = match arm64_operand.vas { + // Not an fp/simd instruction. + arch::arm64::Arm64Vas::ARM64_VAS_INVALID => ARCH_WORD_SIZE, + // 1 byte. + arch::arm64::Arm64Vas::ARM64_VAS_1B => 1, + // 2 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_1H => 2, + // 4 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_4B + | arch::arm64::Arm64Vas::ARM64_VAS_2H + | arch::arm64::Arm64Vas::ARM64_VAS_1S => 4, + // 8 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_8B + | arch::arm64::Arm64Vas::ARM64_VAS_4H + | arch::arm64::Arm64Vas::ARM64_VAS_2S + | arch::arm64::Arm64Vas::ARM64_VAS_1D => 8, + // 16 bytes. + arch::arm64::Arm64Vas::ARM64_VAS_16B + | arch::arm64::Arm64Vas::ARM64_VAS_8H + | arch::arm64::Arm64Vas::ARM64_VAS_4S + | arch::arm64::Arm64Vas::ARM64_VAS_2D + | arch::arm64::Arm64Vas::ARM64_VAS_1Q => 16, + }; + let push = addr..addr.strict_add(size); + // FIXME: This now has access type info in the latest + // git version of capstone because this pissed me off + // and I added it. Change this when it updates. + acc_events.push(AccessEvent::Read(push.clone())); + acc_events.push(AccessEvent::Write(push)); + } + _ => (), + } + } + #[cfg(target_arch = "arm")] + arch::ArchOperand::ArmOperand(arm_operand) => + match arm_operand.op_type { + arch::arm::ArmOperandType::Mem(_) => { + // We don't get info on the size of the access, but + // we're at least told if it's a vector instruction. + let size = if arm_operand.vector_index.is_some() { + ARCH_MAX_ACCESS_SIZE + } else { + ARCH_WORD_SIZE + }; + let push = addr..addr.strict_add(size); + let acc_ty = arm_operand.access.unwrap(); + if acc_ty.is_readable() { + acc_events.push(AccessEvent::Read(push.clone())); + } + if acc_ty.is_writable() { + acc_events.push(AccessEvent::Write(push)); + } + } + _ => (), + }, + #[cfg(any(target_arch = "riscv32", target_arch = "riscv64"))] + arch::ArchOperand::RiscVOperand(risc_voperand) => { + match risc_voperand { + arch::riscv::RiscVOperand::Mem(_) => { + // We get basically no info here. + let push = addr..addr.strict_add(size); + acc_events.push(AccessEvent::Read(push.clone())); + acc_events.push(AccessEvent::Write(push)); + } + _ => (), + } + } + _ => unimplemented!(), + } + } + + Ok(()) + } + + // Get information on what caused the segfault. This contains the address + // that triggered it. + let siginfo = ptrace::getsiginfo(pid).map_err(|_| ExecError::Shrug)?; + // All x86, ARM, etc. instructions only have at most one memory operand + // (thankfully!) 
+ // SAFETY: si_addr is safe to call. + let addr = unsafe { siginfo.si_addr().addr() }; + let page_addr = addr.strict_sub(addr.strict_rem(page_size)); + + if ch_pages.iter().any(|pg| (*pg..pg.strict_add(page_size)).contains(&addr)) { + // Overall structure: + // - Get the address that caused the segfault + // - Unprotect the memory + // - Step 1 instruction + // - Parse executed code to estimate size & type of access + // - Reprotect the memory + // - Continue + + // Zero out the stack + for a in (ch_stack..ch_stack.strict_add(FAKE_STACK_SIZE)).step_by(ARCH_WORD_SIZE) { + ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap(); + } + + let stack_ptr = ch_stack.strict_add(FAKE_STACK_SIZE / 2); + let regs_bak = ptrace::getregs(pid).unwrap(); + let mut new_regs = regs_bak; + let ip_prestep = regs_bak.ip(); + + // Move the instr ptr into the deprotection code. + #[expect(clippy::as_conversions)] + new_regs.set_ip(mempr_off as usize); + // Don't mess up the stack by accident! + new_regs.set_sp(stack_ptr); + + // Modify the PAGE_ADDR global on the child process to point to the page + // that we want unprotected. + ptrace::write( + pid, + (&raw const PAGE_ADDR).cast_mut().cast(), + libc::c_long::try_from(page_addr).unwrap(), + ) + .unwrap(); + + // Check if we also own the next page, and if so unprotect it in case + // the access spans the page boundary. + if ch_pages.contains(&page_addr.strict_add(page_size)) { + ptrace::write(pid, (&raw const PAGE_COUNT).cast_mut().cast(), 2).unwrap(); + } else { + ptrace::write(pid, (&raw const PAGE_COUNT).cast_mut().cast(), 1).unwrap(); + } + + ptrace::setregs(pid, new_regs).unwrap(); + + // Our mempr_* functions end with a raise(SIGSTOP). + wait_for_signal(Some(pid), signal::SIGSTOP, true)?; + + // Step 1 instruction. + ptrace::setregs(pid, regs_bak).unwrap(); + ptrace::step(pid, None).unwrap(); + // Don't use wait_for_signal here since 1 instruction doesn't give room + // for any uncertainty + we don't want it `cont()`ing randomly by accident + // Also, don't let it continue with unprotected memory if something errors! + let _ = wait::waitid(wait::Id::Pid(pid), WAIT_FLAGS).map_err(|_| ExecError::Died(None))?; + + // Zero it out again to be safe + for a in (ch_stack..ch_stack.strict_add(FAKE_STACK_SIZE)).step_by(ARCH_WORD_SIZE) { + ptrace::write(pid, std::ptr::with_exposed_provenance_mut(a), 0).unwrap(); } + + // Save registers and grab the bytes that were executed. This would + // be really nasty if it was a jump or similar but those thankfully + // won't do memory accesses and so can't trigger this! + let regs_bak = ptrace::getregs(pid).unwrap(); + new_regs = regs_bak; + let ip_poststep = regs_bak.ip(); + // We need to do reads/writes in word-sized chunks. + let diff = (ip_poststep.strict_sub(ip_prestep)).div_ceil(ARCH_WORD_SIZE); + let instr = (ip_prestep..ip_prestep.strict_add(diff)).fold(vec![], |mut ret, ip| { + // This only needs to be a valid pointer in the child process, not ours. + ret.append( + &mut ptrace::read(pid, std::ptr::without_provenance_mut(ip)) + .unwrap() + .to_ne_bytes() + .to_vec(), + ); + ret + }); + + // Now figure out the size + type of access and log it down + // This will mark down e.g. the same area being read multiple times, + // since it's more efficient to compress the accesses at the end. + if capstone_disassemble(&instr, addr, cs, acc_events).is_err() { + // Read goes first because we need to be pessimistic. 
+ acc_events.push(AccessEvent::Read(addr..addr.strict_add(ARCH_MAX_ACCESS_SIZE))); + acc_events.push(AccessEvent::Write(addr..addr.strict_add(ARCH_MAX_ACCESS_SIZE))); + } + + // Reprotect everything and continue. + #[expect(clippy::as_conversions)] + new_regs.set_ip(mempr_on as usize); + new_regs.set_sp(stack_ptr); + ptrace::setregs(pid, new_regs).unwrap(); + wait_for_signal(Some(pid), signal::SIGSTOP, true)?; + + ptrace::setregs(pid, regs_bak).unwrap(); + ptrace::syscall(pid, None).unwrap(); + Ok(()) + } else { + // This was a real segfault, so print some debug info and quit. + let regs = ptrace::getregs(pid).unwrap(); + eprintln!("Segfault occurred during FFI at {addr:#018x}"); + eprintln!("Expected access on pages: {ch_pages:#018x?}"); + eprintln!("Register dump: {regs:#x?}"); + ptrace::kill(pid).unwrap(); + Err(ExecError::Died(None)) + } +} + +// We only get dropped into these functions via offsetting the instr pointer +// manually, so we *must not ever* unwind from them. + +/// Disables protections on the page whose address is currently in `PAGE_ADDR`. +/// +/// SAFETY: `PAGE_ADDR` should be set to a page-aligned pointer to an owned page, +/// `PAGE_SIZE` should be the host pagesize, and the range from `PAGE_ADDR` to +/// `PAGE_SIZE` * `PAGE_COUNT` must be owned and allocated memory. No other threads +/// should be running. +pub unsafe extern "C" fn mempr_off() { + use std::sync::atomic::Ordering; + + let len = PAGE_SIZE.load(Ordering::Relaxed).wrapping_mul(PAGE_COUNT.load(Ordering::Relaxed)); + // SAFETY: Upheld by "caller". + unsafe { + // It's up to the caller to make sure this doesn't actually overflow, but + // we mustn't unwind from here, so... + if libc::mprotect( + PAGE_ADDR.load(Ordering::Relaxed).cast(), + len, + libc::PROT_READ | libc::PROT_WRITE, + ) != 0 + { + // Can't return or unwind, but we can do this. + std::process::exit(-1); + } + } + // If this fails somehow we're doomed. + if signal::raise(signal::SIGSTOP).is_err() { + std::process::exit(-1); + } +} + +/// Reenables protection on the page set by `PAGE_ADDR`. +/// +/// SAFETY: See `mempr_off()`. +pub unsafe extern "C" fn mempr_on() { + use std::sync::atomic::Ordering; + + let len = PAGE_SIZE.load(Ordering::Relaxed).wrapping_mul(PAGE_COUNT.load(Ordering::Relaxed)); + // SAFETY: Upheld by "caller". + unsafe { + if libc::mprotect(PAGE_ADDR.load(Ordering::Relaxed).cast(), len, libc::PROT_NONE) != 0 { + std::process::exit(-1); + } + } + if signal::raise(signal::SIGSTOP).is_err() { + std::process::exit(-1); } }
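Note (not part of this diff): the protection dance above (`prepare_ffi` / `unprep_ffi` on the Miri side, `mempr_off` / `mempr_on` executed inside the traced child) boils down to toggling page permissions with `mprotect` so that every native access to Miri-owned memory raises a SIGSEGV the supervisor can inspect. Below is a minimal standalone sketch of that round-trip under the assumption that only the `libc` crate is available; it uses a single anonymous page and no ptrace supervisor, so it is illustrative rather than a reproduction of the PR's code.

// Illustrative sketch only: the PROT_NONE / PROT_READ|PROT_WRITE round-trip
// that the FFI tracing in this PR relies on.
fn main() {
    unsafe {
        // SAFETY: sysconf(_SC_PAGESIZE) is always safe to call.
        let page_size = libc::sysconf(libc::_SC_PAGESIZE) as usize;

        // Grab one anonymous page, initially readable and writable.
        let page = libc::mmap(
            std::ptr::null_mut(),
            page_size,
            libc::PROT_READ | libc::PROT_WRITE,
            libc::MAP_PRIVATE | libc::MAP_ANONYMOUS,
            -1,
            0,
        );
        assert_ne!(page, libc::MAP_FAILED);

        // "prepare_ffi": with PROT_NONE set, any access to this page from
        // native code would raise SIGSEGV, which the supervisor intercepts.
        assert_eq!(libc::mprotect(page, page_size, libc::PROT_NONE), 0);

        // "mempr_off" / "unprep_ffi": restore RW so the access can proceed
        // (the supervisor does this one page at a time, then re-protects).
        assert_eq!(libc::mprotect(page, page_size, libc::PROT_READ | libc::PROT_WRITE), 0);
        page.cast::<u8>().write(42);
        assert_eq!(page.cast::<u8>().read(), 42);

        libc::munmap(page, page_size);
    }
}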