diff --git a/Cargo.lock b/Cargo.lock index cecb2d566c3..613122af7d1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -103,9 +103,9 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.6.15" +version = "0.6.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" +checksum = "8acc5369981196006228e28809f761875c0327210a891e941f4c683b3a99529b" dependencies = [ "anstyle", "anstyle-parse", @@ -142,12 +142,13 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "3.0.4" +version = "3.0.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" +checksum = "ca3534e77181a9cc07539ad51f2141fe32f6c3ffd4df76db8ad92346b003ae4e" dependencies = [ "anstyle", - "windows-sys 0.52.0", + "once_cell", + "windows-sys 0.59.0", ] [[package]] @@ -1065,6 +1066,22 @@ dependencies = [ "tempfile", ] +[[package]] +name = "camino-tempfile-ext" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7b3ab964bd19840be2c70b7795188187a8c1c90006a310b1500bf67b35a80d3" +dependencies = [ + "anstream", + "anstyle", + "camino", + "camino-tempfile", + "globwalk", + "predicates", + "predicates-core", + "predicates-tree", +] + [[package]] name = "cancel-safe-futures" version = "0.1.5" @@ -3724,6 +3741,17 @@ dependencies = [ "regex-syntax 0.8.5", ] +[[package]] +name = "globwalk" +version = "0.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf760ebf69878d9fd8f110c89703d90ce35095324d1f1edcb595c63945ee757" +dependencies = [ + "bitflags 2.9.0", + "ignore", + "walkdir", +] + [[package]] name = "gloo-timers" version = "0.3.0" @@ -4563,6 +4591,22 @@ dependencies = [ "icu_properties", ] +[[package]] +name = "ignore" +version = "0.4.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d89fd380afde86567dfba715db065673989d6253f42b88179abd3eae47bda4b" +dependencies = [ + "crossbeam-deque", + "globset", + "log", + "memchr", + "regex-automata", + "same-file", + "walkdir", + "winapi-util", +] + [[package]] name = "illumos-devinfo" version = "0.1.0" @@ -8460,7 +8504,7 @@ dependencies = [ "strum", "tabled 0.15.0", "tempfile", - "termtree 0.5.1", + "termtree", "thiserror 1.0.69", "tokio", "tokio-util", @@ -9432,12 +9476,12 @@ checksum = "ae8177bee8e75d6846599c6b9ff679ed51e882816914eec639944d7c9aa11931" [[package]] name = "predicates-tree" -version = "1.0.11" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41b740d195ed3166cd147c8047ec98db0e22ec019eb8eeb76d343b795304fb13" +checksum = "72dd2d6d381dfb73a193c7fca536518d7caee39fc8503f74e7dc0be0531b425c" dependencies = [ "predicates-core", - "termtree 0.4.1", + "termtree", ] [[package]] @@ -11556,9 +11600,20 @@ version = "0.1.0" dependencies = [ "anyhow", "camino", + "camino-tempfile-ext", + "dropshot", + "id-map", + "illumos-utils", "nexus-sled-agent-shared", + "omicron-common", + "omicron-uuid-kinds", "omicron-workspace-hack", + "pretty_assertions", + "serde_json", "sled-storage", + "slog", + "slog-error-chain", + "thiserror 1.0.69", ] [[package]] @@ -12480,12 +12535,6 @@ dependencies = [ "libc", ] -[[package]] -name = "termtree" -version = "0.4.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" - [[package]] name = "termtree" version = "0.5.1" diff --git a/Cargo.toml b/Cargo.toml index b3fc96fc819..f63050ca124 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -365,6 +365,7 @@ byteorder = "1.5.0" bytes = "1.10.1" camino = { version = "1.1", features = ["serde1"] } camino-tempfile = "1.3.0" +camino-tempfile-ext = { version = "0.3.0", features = ["assert-color"] } cancel-safe-futures = "0.1.5" cargo_metadata = "0.19.2" chacha20poly1305 = "0.10.1" diff --git a/sled-agent/src/bootstrap/pre_server.rs b/sled-agent/src/bootstrap/pre_server.rs index 35a8c1f3b86..fe456bb1f5e 100644 --- a/sled-agent/src/bootstrap/pre_server.rs +++ b/sled-agent/src/bootstrap/pre_server.rs @@ -148,6 +148,7 @@ impl BootstrapAgentStartup { config.switch_zone_maghemite_links.clone(), long_running_task_handles.storage_manager.clone(), long_running_task_handles.zone_bundler.clone(), + long_running_task_handles.zone_image_resolver.clone(), ); // Inform the hardware monitor that the service manager is ready diff --git a/sled-agent/src/long_running_tasks.rs b/sled-agent/src/long_running_tasks.rs index 6475eebad0d..ce0c3b3f6fe 100644 --- a/sled-agent/src/long_running_tasks.rs +++ b/sled-agent/src/long_running_tasks.rs @@ -23,12 +23,15 @@ use crate::sled_agent::SledAgent; use crate::storage_monitor::{StorageMonitor, StorageMonitorHandle}; use crate::zone_bundle::ZoneBundler; use bootstore::schemes::v0 as bootstore; +use illumos_utils::zpool::ZpoolName; use key_manager::{KeyManager, StorageKeyRequester}; use sled_agent_types::zone_bundle::CleanupContext; +use sled_agent_zone_images::{ZoneImageSourceResolver, ZoneImageZpools}; use sled_hardware::{HardwareManager, SledMode, UnparsedDisk}; use sled_storage::config::MountConfig; use sled_storage::disk::RawSyntheticDisk; use sled_storage::manager::{StorageHandle, StorageManager}; +use sled_storage::resources::AllDisks; use slog::{Logger, info}; use std::net::Ipv6Addr; use tokio::sync::oneshot; @@ -59,6 +62,13 @@ pub struct LongRunningTaskHandles { // A reference to the object used to manage zone bundles pub zone_bundler: ZoneBundler, + + /// Resolver for zone image sources. + /// + /// This isn't really implemented in the backend as a task per se, but it + /// looks like one from the outside, and is convenient to put here. (If it + /// had any async involved within it, it would be a task.) + pub zone_image_resolver: ZoneImageSourceResolver, } /// Spawn all long running tasks @@ -95,18 +105,21 @@ pub async fn spawn_all_longrunning_tasks( // Wait for the boot disk so that we can work with any ledgers, // such as those needed by the bootstore and sled-agent info!(log, "Waiting for boot disk"); - let (disk_id, _) = storage_manager.wait_for_boot_disk().await; + let (disk_id, boot_zpool) = storage_manager.wait_for_boot_disk().await; info!(log, "Found boot disk {:?}", disk_id); + let all_disks = storage_manager.get_latest_disks().await; let bootstore = spawn_bootstore_tasks( log, - &mut storage_manager, + &all_disks, &hardware_manager, global_zone_bootstrap_ip, ) .await; let zone_bundler = spawn_zone_bundler_tasks(log, &mut storage_manager); + let zone_image_resolver = + make_zone_image_resolver(log, &all_disks, &boot_zpool); ( LongRunningTaskHandles { @@ -116,6 +129,7 @@ pub async fn spawn_all_longrunning_tasks( hardware_manager, bootstore, zone_bundler, + zone_image_resolver, }, sled_agent_started_tx, service_manager_ready_tx, @@ -195,13 +209,12 @@ fn spawn_hardware_monitor( async fn spawn_bootstore_tasks( log: &Logger, - storage_handle: &mut StorageHandle, + all_disks: &AllDisks, hardware_manager: &HardwareManager, global_zone_bootstrap_ip: Ipv6Addr, ) -> bootstore::NodeHandle { - let iter_all = storage_handle.get_latest_disks().await; let config = new_bootstore_config( - &iter_all, + all_disks, hardware_manager.baseboard(), global_zone_bootstrap_ip, ) @@ -233,6 +246,18 @@ fn spawn_zone_bundler_tasks( ZoneBundler::new(log, storage_handle.clone(), CleanupContext::default()) } +fn make_zone_image_resolver( + log: &Logger, + all_disks: &AllDisks, + boot_zpool: &ZpoolName, +) -> ZoneImageSourceResolver { + let zpools = ZoneImageZpools { + root: &all_disks.mount_config().root, + all_m2_zpools: all_disks.all_m2_zpools(), + }; + ZoneImageSourceResolver::new(log, &zpools, boot_zpool) +} + async fn upsert_synthetic_disks_if_needed( log: &Logger, storage_manager: &StorageHandle, diff --git a/sled-agent/src/services.rs b/sled-agent/src/services.rs index c36953d20a9..4396f77f375 100644 --- a/sled-agent/src/services.rs +++ b/sled-agent/src/services.rs @@ -929,6 +929,7 @@ impl ServiceManager { switch_zone_maghemite_links: Vec, storage: StorageHandle, zone_bundler: ZoneBundler, + zone_image_resolver: ZoneImageSourceResolver, ) -> Self { Self::new_inner( log, @@ -940,6 +941,7 @@ impl ServiceManager { switch_zone_maghemite_links, storage, zone_bundler, + zone_image_resolver, RealSystemApi::new(), ) } @@ -955,6 +957,7 @@ impl ServiceManager { switch_zone_maghemite_links: Vec, storage: StorageHandle, zone_bundler: ZoneBundler, + zone_image_resolver: ZoneImageSourceResolver, system_api: Box, ) -> Self { let log = log.new(o!("component" => "ServiceManager")); @@ -990,7 +993,7 @@ impl ServiceManager { .switch_zone_bootstrap_ip, storage, zone_bundler, - zone_image_resolver: ZoneImageSourceResolver::new(), + zone_image_resolver, ledger_directory_override: OnceLock::new(), system_api, }), @@ -5141,6 +5144,7 @@ mod illumos_tests { use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use omicron_uuid_kinds::OmicronZoneUuid; + use sled_agent_zone_images::ZoneImageZpools; use sled_storage::manager_test_harness::StorageManagerTestHarness; use std::{ net::{Ipv6Addr, SocketAddrV6}, @@ -5342,6 +5346,7 @@ mod illumos_tests { log: slog::Logger, storage_test_harness: StorageManagerTestHarness, zone_bundler: ZoneBundler, + zone_image_resolver: ZoneImageSourceResolver, test_config: &'a TestConfig, } @@ -5354,10 +5359,21 @@ mod illumos_tests { Default::default(), ); + let mut storage_manager = storage_test_harness.handle().clone(); + let all_disks = storage_manager.get_latest_disks().await; + let (_, boot_zpool) = storage_manager.wait_for_boot_disk().await; + let zpools = ZoneImageZpools { + root: &all_disks.mount_config().root, + all_m2_zpools: all_disks.all_m2_zpools(), + }; + let zone_image_resolver = + ZoneImageSourceResolver::new(&log, &zpools, &boot_zpool); + LedgerTestHelper { log, storage_test_harness, zone_bundler, + zone_image_resolver, test_config, } } @@ -5392,6 +5408,7 @@ mod illumos_tests { vec![], self.storage_test_harness.handle().clone(), self.zone_bundler.clone(), + self.zone_image_resolver.clone(), system, ); self.test_config.override_paths(&mgr); diff --git a/sled-agent/zone-images/Cargo.toml b/sled-agent/zone-images/Cargo.toml index 57c9ac92c63..5c388a19dd2 100644 --- a/sled-agent/zone-images/Cargo.toml +++ b/sled-agent/zone-images/Cargo.toml @@ -10,6 +10,19 @@ workspace = true [dependencies] anyhow.workspace = true camino.workspace = true +id-map.workspace = true +illumos-utils.workspace = true nexus-sled-agent-shared.workspace = true +omicron-common.workspace = true omicron-workspace-hack.workspace = true +serde_json.workspace = true sled-storage.workspace = true +slog.workspace = true +slog-error-chain.workspace = true +thiserror.workspace = true + +[dev-dependencies] +camino-tempfile-ext.workspace = true +dropshot.workspace = true +omicron-uuid-kinds.workspace = true +pretty_assertions.workspace = true diff --git a/sled-agent/zone-images/src/lib.rs b/sled-agent/zone-images/src/lib.rs index 6355188d7e0..a6956e0956c 100644 --- a/sled-agent/zone-images/src/lib.rs +++ b/sled-agent/zone-images/src/lib.rs @@ -7,6 +7,8 @@ //! This contains a subset of zone image code at the moment: you're encouraged //! to move more code into this crate as appropriate. +mod mupdate_override; mod source_resolver; +pub(crate) use mupdate_override::*; pub use source_resolver::*; diff --git a/sled-agent/zone-images/src/mupdate_override.rs b/sled-agent/zone-images/src/mupdate_override.rs new file mode 100644 index 00000000000..0b2337f3110 --- /dev/null +++ b/sled-agent/zone-images/src/mupdate_override.rs @@ -0,0 +1,1016 @@ +// This Source Code Form is subject to the terms of the Mozilla Public +// License, v. 2.0. If a copy of the MPL was not distributed with this +// file, You can obtain one at https://mozilla.org/MPL/2.0/. + +//! Track MUPdate overrides within sled-agent. +//! +//! For more about commingling MUPdate and update, see RFD 556. + +use std::fs; +use std::fs::FileType; +use std::sync::Arc; + +use crate::ZoneImageZpools; +use camino::Utf8Path; +use camino::Utf8PathBuf; +use id_map::IdMap; +use id_map::IdMappable; +use illumos_utils::zpool::ZpoolName; +use omicron_common::update::MupdateOverrideInfo; +use sled_storage::dataset::INSTALL_DATASET; +use slog::debug; +use slog::error; +use slog::info; +use slog::o; +use slog::warn; +use slog_error_chain::InlineErrorChain; +use thiserror::Error; + +#[derive(Debug)] +pub(crate) struct AllMupdateOverrides { + boot_zpool: ZpoolName, + boot_disk_path: Utf8PathBuf, + boot_disk_override: + Result, MupdateOverrideReadError>, + non_boot_disk_overrides: IdMap, +} + +impl AllMupdateOverrides { + /// Attempt to find MUPdate override files. If present, this file will cause + /// install-dataset artifacts to be used even if the image source is Artifact. + /// + /// For now we treat the boot disk as authoritative, since install-dataset + /// artifacts are always served from the boot disk. There is a class of issues + /// here related to transient failures on one of the M.2s that we're + /// acknowledging but not tackling for now. + /// + /// In general, this API follows an interpreter pattern: first read all the + /// results and put them in a map, then make decisions based on them in a + /// separate step. This enables better testing and ensures that changes to + /// behavior are described in the type system. + /// + /// For more about commingling MUPdate and update, see RFD 556. + /// + /// TODO: This is somewhat complex error handling logic that's similar to, + /// but different from, `Ledgerable` (for example, it only does an equality + /// check, not an ordering check, and it always considers the boot disk to + /// be authoritative). Consider extracting this out into something generic. + pub(crate) fn read_all( + log: &slog::Logger, + zpools: &ZoneImageZpools<'_>, + boot_zpool: &ZpoolName, + ) -> Self { + let dataset = + boot_zpool.dataset_mountpoint(zpools.root, INSTALL_DATASET); + + let (boot_disk_path, boot_disk_res) = + read_mupdate_override(log, &dataset); + + // Now read the file from all other disks. We attempt to make sure they + // match up and will log a warning if they don't, though (until we have + // a better story on transient failures) it's not fatal. + let non_boot_zpools = zpools + .all_m2_zpools + .iter() + .filter(|&zpool_name| zpool_name != boot_zpool); + let non_boot_disks_overrides = non_boot_zpools + .map(|zpool_name| { + let dataset = + zpool_name.dataset_mountpoint(zpools.root, INSTALL_DATASET); + + let (path, res) = read_mupdate_override(log, &dataset); + MupdateOverrideNonBootInfo { + zpool_name: *zpool_name, + path, + result: MupdateOverrideNonBootResult::new( + res, + &boot_disk_res, + ), + } + }) + .collect(); + + let ret = Self { + boot_zpool: *boot_zpool, + boot_disk_path, + boot_disk_override: boot_disk_res, + non_boot_disk_overrides: non_boot_disks_overrides, + }; + + ret.log_results(&log); + ret + } + + fn log_results(&self, log: &slog::Logger) { + let log = log.new(o!( + "boot_zpool" => self.boot_zpool.to_string(), + "boot_disk_path" => self.boot_disk_path.to_string(), + )); + + match &self.boot_disk_override { + Ok(Some(mupdate_override)) => { + info!( + log, + "found mupdate override for boot disk"; + "data" => ?mupdate_override, + ); + } + Ok(None) => { + info!(log, "no mupdate override for boot disk"); + } + Err(error) => { + // This error most likely requires operator intervention -- if + // it happens, we'll continue to bring sled-agent up but reject + // all zone image lookups. + error!( + log, + "error reading mupdate override for boot disk, \ + will not bring up zones"; + "error" => InlineErrorChain::new(error), + ); + } + } + + if self.non_boot_disk_overrides.is_empty() { + warn!( + log, + "no non-boot zpools found, unable to verify consistency -- \ + this may be a hardware issue with the non-boot M.2" + ); + } + + for info in &self.non_boot_disk_overrides { + info.log_result(&log); + } + } +} + +fn read_mupdate_override( + log: &slog::Logger, + dataset_dir: &Utf8Path, +) -> (Utf8PathBuf, Result, MupdateOverrideReadError>) +{ + let override_path = dataset_dir.join(MupdateOverrideInfo::FILE_NAME); + + fn inner( + log: &slog::Logger, + dataset_dir: &Utf8Path, + override_path: &Utf8Path, + ) -> Result, MupdateOverrideReadError> { + // First check that the dataset directory exists. This distinguishes the + // two cases: + // + // 1. The install dataset is missing (an error). + // 2. The install dataset is present, but the override file is missing + // (expected). + // + // It would be nice if we could use openat-style APIs to read the file + // from the opened directory, but: + // + // * those don't exist in rust std + // * it's not crucial -- we don't expect TOCTTOU races much in this code + // path, and we're not generally resilient to them anyway. + // + // We use symlink_metadata (lstat) rather than metadata (stat) because + // there really shouldn't be any symlinks involved. + let dir_metadata = + fs::symlink_metadata(dataset_dir).map_err(|error| { + MupdateOverrideReadError::DatasetDirMetadata { + dataset_dir: dataset_dir.to_owned(), + error: Arc::new(error), + } + })?; + if !dir_metadata.is_dir() { + return Err(MupdateOverrideReadError::DatasetNotDirectory { + dataset_dir: dataset_dir.to_owned(), + file_type: dir_metadata.file_type(), + }); + } + + let mupdate_override = match std::fs::read_to_string(&override_path) { + Ok(data) => { + let data = serde_json::from_str::(&data) + .map_err(|error| { + MupdateOverrideReadError::Deserialize { + path: override_path.to_owned(), + error: Arc::new(error), + contents: data, + } + })?; + Some(data) + } + Err(error) => { + if error.kind() == std::io::ErrorKind::NotFound { + debug!( + log, + "mupdate override file not found, treating as absent"; + "path" => %override_path + ); + None + } else { + return Err(MupdateOverrideReadError::Read { + path: override_path.to_owned(), + error: Arc::new(error), + }); + } + } + }; + + Ok(mupdate_override) + } + + let res = inner(log, dataset_dir, &override_path); + (override_path, res) +} + +#[derive(Clone, Debug, PartialEq)] +struct MupdateOverrideNonBootInfo { + /// The name of the zpool. + zpool_name: ZpoolName, + + /// The path that was read from. + path: Utf8PathBuf, + + /// The result of performing the read operation. + result: MupdateOverrideNonBootResult, +} + +impl MupdateOverrideNonBootInfo { + fn log_result(&self, log: &slog::Logger) { + let log = log.new(o!( + "non_boot_zpool_name" => self.zpool_name.to_string(), + "non_boot_path" => self.path.to_string(), + )); + + match &self.result { + MupdateOverrideNonBootResult::MatchesAbsent => { + info!( + log, + "mupdate override absent on this non-boot \ + disk, matches absence on boot disk", + ); + } + MupdateOverrideNonBootResult::MatchesPresent => { + info!( + log, + "mupdate override present on this non-boot \ + disk, matches presence on boot disk", + ); + } + MupdateOverrideNonBootResult::ReadError(error) => { + warn!( + log, + "failed to read mupdate override from other disk"; + "error" => InlineErrorChain::new(error), + ); + } + MupdateOverrideNonBootResult::Mismatch(mismatch) => { + mismatch.log_to(&log) + } + } + } +} + +impl IdMappable for MupdateOverrideNonBootInfo { + type Id = ZpoolName; + + fn id(&self) -> Self::Id { + self.zpool_name + } +} + +#[derive(Clone, Debug, PartialEq)] +enum MupdateOverrideNonBootResult { + /// The override is present and matches the value on the boot disk. + MatchesPresent, + + /// The override is absent and is also absent on the boot disk. + MatchesAbsent, + + /// A mismatch between the boot disk and the other disk was detected. + Mismatch(MupdateOverrideNonBootMismatch), + + /// An error occurred while reading the mupdate override info on this disk. + ReadError(MupdateOverrideReadError), +} + +impl MupdateOverrideNonBootResult { + fn new( + res: Result, MupdateOverrideReadError>, + boot_disk_res: &Result< + Option, + MupdateOverrideReadError, + >, + ) -> Self { + match (res, boot_disk_res) { + (Ok(Some(non_boot_disk_info)), Ok(Some(boot_disk_info))) => { + if boot_disk_info == &non_boot_disk_info { + Self::MatchesPresent + } else { + Self::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { + non_boot_disk_info, + }, + ) + } + } + (Ok(Some(non_boot_disk_info)), Ok(None)) => Self::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info, + }, + ), + (Ok(None), Ok(Some(_))) => Self::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent, + ), + (Ok(None), Ok(None)) => Self::MatchesAbsent, + (Ok(non_boot_disk_info), Err(_)) => Self::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { + non_boot_disk_info, + }, + ), + (Err(error), _) => Self::ReadError(error), + } + } +} + +#[derive(Clone, Debug, PartialEq, Eq)] +enum MupdateOverrideNonBootMismatch { + /// The override is present on the boot disk but absent on the other disk. + BootPresentOtherAbsent, + + /// The override is absent on the boot disk but present on the other disk. + BootAbsentOtherPresent { + /// The information found on the other disk. + non_boot_disk_info: MupdateOverrideInfo, + }, + + /// The override value differs between the boot disk and the other disk. + ValueMismatch { non_boot_disk_info: MupdateOverrideInfo }, + + /// There was a read error on the boot disk, so we were unable to verify + /// consistency. + BootDiskReadError { + /// The value as found on this disk. This value is logged but not used. + non_boot_disk_info: Option, + }, +} + +impl MupdateOverrideNonBootMismatch { + // This function assumes that `log` has already been provided context about + // the zpool name and path. + fn log_to(&self, log: &slog::Logger) { + match self { + Self::BootPresentOtherAbsent => { + warn!( + log, + "mupdate override absent on this non-boot disk but \ + present on boot disk, treating file on boot disk as \ + authoritative" + ) + } + Self::BootAbsentOtherPresent { non_boot_disk_info } => { + warn!( + log, + "mupdate override present on this non-boot disk but \ + absent on boot disk, treating the absence on boot disk \ + as authoritative"; + "non_boot_disk_info" => ?non_boot_disk_info, + ) + } + Self::ValueMismatch { non_boot_disk_info } => { + warn!( + log, + "mupdate override value present on both this non-boot \ + disk and the boot disk, but different across disks, \ + treating boot disk as authoritative"; + "non_boot_disk_info" => ?non_boot_disk_info, + ) + } + Self::BootDiskReadError { non_boot_disk_info } => { + warn!( + log, + "mupdate override read error on boot disk, unable \ + to verify consistency across disks"; + "non_boot_disk_info" => ?non_boot_disk_info, + ) + } + } + } +} + +#[derive(Clone, Debug, Error)] +enum MupdateOverrideReadError { + #[error( + "error retrieving metadata for install dataset directory \ + `{dataset_dir}`" + )] + DatasetDirMetadata { + dataset_dir: Utf8PathBuf, + #[source] + error: Arc, + }, + + #[error( + "expected install dataset `{dataset_dir}` to be a directory, \ + found {file_type:?}" + )] + DatasetNotDirectory { dataset_dir: Utf8PathBuf, file_type: FileType }, + + #[error("error reading mupdate override from `{path}`")] + Read { + path: Utf8PathBuf, + #[source] + error: Arc, + }, + + #[error( + "error deserializing `{path}` into MupdateOverrideInfo, \ + contents: {contents:?}" + )] + Deserialize { + path: Utf8PathBuf, + contents: String, + #[source] + error: Arc, + }, +} + +/// This aids tremendously in testing. +/// +/// `MupdateOverrideReadError` forms an equivalence class (i.e. it satisfies the +/// reflexive property `a == a`), so it should in principle be okay to implement +/// `Eq` for it as well. But this algorithm doesn't fully compare for equality, +/// just enough for tests, so we don't implement `Eq`. +/// +/// (But if there's a use case for `Eq` in the future, we should consider +/// implementing it.) +impl PartialEq for MupdateOverrideReadError { + fn eq(&self, other: &Self) -> bool { + match (self, other) { + ( + Self::DatasetDirMetadata { dataset_dir: dir1, error: error1 }, + Self::DatasetDirMetadata { dataset_dir: dir2, error: error2 }, + ) => { + // Simply comparing io::ErrorKind is good enough for tests. + dir1 == dir2 && error1.kind() == error2.kind() + } + ( + Self::DatasetNotDirectory { + dataset_dir: dir1, + file_type: type1, + }, + Self::DatasetNotDirectory { + dataset_dir: dir2, + file_type: type2, + }, + ) => dir1 == dir2 && type1 == type2, + ( + Self::Read { path: path1, error: error1 }, + Self::Read { path: path2, error: error2 }, + ) => { + // Simply comparing io::ErrorKind is good enough for tests. + path1 == path2 && error1.kind() == error2.kind() + } + ( + Self::Deserialize { + path: path1, + contents: contents1, + error: error1, + }, + Self::Deserialize { + path: path2, + contents: contents2, + error: error2, + }, + ) => { + // Comparing error line/column/category is enough for tests. + path1 == path2 + && contents1 == contents2 + && error1.line() == error2.line() + && error1.column() == error2.column() + && error1.classify() == error2.classify() + } + _ => false, + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use camino_tempfile_ext::prelude::*; + use dropshot::ConfigLogging; + use dropshot::ConfigLoggingLevel; + use dropshot::test_util::LogContext; + use omicron_uuid_kinds::MupdateOverrideUuid; + use omicron_uuid_kinds::ZpoolUuid; + use pretty_assertions::assert_eq; + use std::collections::BTreeSet; + use std::io; + use std::sync::LazyLock; + + struct OverridePaths { + install_dataset: Utf8PathBuf, + override_json: Utf8PathBuf, + } + + impl OverridePaths { + fn for_uuid(uuid: ZpoolUuid) -> Self { + let install_dataset = + Utf8PathBuf::from(format!("pool/int/{uuid}/install")); + let mupdate_override_json = + install_dataset.join("mupdate-override.json"); + Self { install_dataset, override_json: mupdate_override_json } + } + } + + const BOOT_UUID: ZpoolUuid = + ZpoolUuid::from_u128(0xd3e7205d_4efe_493b_ac5e_9175584907cd); + const BOOT_ZPOOL: ZpoolName = ZpoolName::new_internal(BOOT_UUID); + static BOOT_PATHS: LazyLock = + LazyLock::new(|| OverridePaths::for_uuid(BOOT_UUID)); + + const NON_BOOT_UUID: ZpoolUuid = + ZpoolUuid::from_u128(0x4854189f_b290_47cd_b076_374d0e1748ec); + const NON_BOOT_ZPOOL: ZpoolName = ZpoolName::new_internal(NON_BOOT_UUID); + static NON_BOOT_PATHS: LazyLock = + LazyLock::new(|| OverridePaths::for_uuid(NON_BOOT_UUID)); + + const NON_BOOT_2_UUID: ZpoolUuid = + ZpoolUuid::from_u128(0x72201e1e_9fee_4231_81cd_4e2d514cb632); + const NON_BOOT_2_ZPOOL: ZpoolName = + ZpoolName::new_internal(NON_BOOT_2_UUID); + static NON_BOOT_2_PATHS: LazyLock = + LazyLock::new(|| OverridePaths::for_uuid(NON_BOOT_2_UUID)); + + const NON_BOOT_3_UUID: ZpoolUuid = + ZpoolUuid::from_u128(0xd0d04947_93c5_40fd_97ab_4648b8cc28d6); + const NON_BOOT_3_ZPOOL: ZpoolName = + ZpoolName::new_internal(NON_BOOT_3_UUID); + static NON_BOOT_3_PATHS: LazyLock = + LazyLock::new(|| OverridePaths::for_uuid(NON_BOOT_3_UUID)); + + static OVERRIDE_UUID: MupdateOverrideUuid = + MupdateOverrideUuid::from_u128(0x70b965c2_fc95_4843_a34d_a2c7246788a8); + static OVERRIDE_2_UUID: MupdateOverrideUuid = + MupdateOverrideUuid::from_u128(0x20588f8f_c680_4101_afc7_820226d03ada); + + /// Boot disk present / no other disks. (This produces a warning, but is + /// otherwise okay.) + #[test] + fn read_solo_boot_disk() { + let logctx = LogContext::new( + "mupdate_override_read_other_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let dir = Utf8TempDir::new().unwrap(); + + dir.child(&BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + Some(&override_info) + ); + assert_eq!(overrides.non_boot_disk_overrides, IdMap::new()); + + logctx.cleanup_successful(); + } + + /// Matching case: boot disk present / other disk present. + #[test] + fn read_both_present() { + let logctx = LogContext::new( + "mupdate_override_read_both_present", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let dir = Utf8TempDir::new().unwrap(); + + dir.child(&BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + dir.child(&NON_BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + Some(&override_info) + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::MatchesPresent, + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Matching case: boot disk absent / other disk absent. + #[test] + fn read_both_absent() { + let logctx = LogContext::new( + "mupdate_override_read_both_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + + // Create the directories but not the override JSONs within them. + dir.child(&BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + None, + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::MatchesAbsent, + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Mismatch case: Boot disk present / other disk absent. + #[test] + fn read_boot_present_other_absent() { + let logctx = LogContext::new( + "mupdate_override_read_boot_present_other_absent", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let dir = Utf8TempDir::new().unwrap(); + + dir.child(&BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + // Create the directory, but not the override JSON within it. + dir.child(&NON_BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + Some(&override_info) + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootPresentOtherAbsent, + ), + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Mismatch case: Boot disk absent / other disk present. + #[test] + fn read_boot_absent_other_present() { + let logctx = LogContext::new( + "mupdate_override_read_boot_absent_other_present", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let dir = Utf8TempDir::new().unwrap(); + + // Create the directory, but not the override JSON within it. + dir.child(&BOOT_PATHS.install_dataset).create_dir_all().unwrap(); + + dir.child(&NON_BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + None, + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootAbsentOtherPresent { + non_boot_disk_info: override_info.clone() + }, + ), + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Mismatch case: present on both disks but values differ. + #[test] + fn read_different_values() { + let logctx = LogContext::new( + "mupdate_override_read_different_values", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let override_info_2 = override_info_2(); + let dir = Utf8TempDir::new().unwrap(); + + dir.child(&BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .expect("failed to write override json"); + dir.child(&NON_BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info_2).unwrap()) + .expect("failed to write override json"); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap().as_ref(), + Some(&override_info), + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::ValueMismatch { + non_boot_disk_info: override_info_2, + } + ), + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Error case: boot and other install datasets don't exist (possibly not + /// mounted? This is a strange situation.) + #[test] + fn read_boot_install_dataset_missing() { + let logctx = LogContext::new( + "mupdate_override_read_boot_install_dataset_missing", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + + // Create the parent directory but not the install dataset directory. + dir.child(&BOOT_PATHS.install_dataset.parent().unwrap()) + .create_dir_all() + .unwrap(); + dir.child(&NON_BOOT_PATHS.install_dataset.parent().unwrap()) + .create_dir_all() + .unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap_err(), + &dataset_missing_error( + &dir.path().join(&BOOT_PATHS.install_dataset) + ) + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::ReadError( + dataset_missing_error( + &dir.path().join(&NON_BOOT_PATHS.install_dataset) + ), + ) + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Error case: boot and other install datasets are not directories + #[test] + fn read_boot_install_dataset_not_dir() { + let logctx = LogContext::new( + "mupdate_override_read_boot_install_dataset_missing", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let dir = Utf8TempDir::new().unwrap(); + + // Make the install directory paths files -- fun! + dir.child(&BOOT_PATHS.install_dataset).touch().unwrap(); + dir.child(&NON_BOOT_PATHS.install_dataset).touch().unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![BOOT_ZPOOL, NON_BOOT_ZPOOL], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap_err(), + &dataset_not_dir_error( + &dir.path().join(&BOOT_PATHS.install_dataset) + ) + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::ReadError( + dataset_not_dir_error( + &dir.path().join(&NON_BOOT_PATHS.install_dataset), + ), + ), + }] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + /// Error case: Boot read error / other present/absent/deserialize error. + #[test] + fn read_boot_read_error() { + let logctx = LogContext::new( + "mupdate_override_read_boot_read_error", + &ConfigLogging::StderrTerminal { level: ConfigLoggingLevel::Debug }, + ); + let override_info = override_info(); + let dir = Utf8TempDir::new().unwrap(); + + // Create an empty file: this won't deserialize correctly. + dir.child(&BOOT_PATHS.override_json).touch().unwrap(); + // File with the correct contents. + dir.child(&NON_BOOT_PATHS.override_json) + .write_str(&serde_json::to_string(&override_info).unwrap()) + .unwrap(); + // File that's absent. + dir.child(&NON_BOOT_2_PATHS.install_dataset).create_dir_all().unwrap(); + // Read error (empty file). + dir.child(&NON_BOOT_3_PATHS.override_json).touch().unwrap(); + + let zpools = ZoneImageZpools { + root: dir.path(), + all_m2_zpools: vec![ + BOOT_ZPOOL, + NON_BOOT_ZPOOL, + NON_BOOT_2_ZPOOL, + NON_BOOT_3_ZPOOL, + ], + }; + let overrides = + AllMupdateOverrides::read_all(&logctx.log, &zpools, &BOOT_ZPOOL); + assert_eq!( + overrides.boot_disk_override.as_ref().unwrap_err(), + &deserialize_error(dir.path(), &BOOT_PATHS.override_json, "",), + ); + assert_eq!( + overrides.non_boot_disk_overrides, + [ + MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_ZPOOL, + path: dir.path().join(&NON_BOOT_PATHS.override_json), + result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { + non_boot_disk_info: Some(override_info), + }, + ), + }, + MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_2_ZPOOL, + path: dir.path().join(&NON_BOOT_2_PATHS.override_json), + result: MupdateOverrideNonBootResult::Mismatch( + MupdateOverrideNonBootMismatch::BootDiskReadError { + non_boot_disk_info: None, + }, + ), + }, + MupdateOverrideNonBootInfo { + zpool_name: NON_BOOT_3_ZPOOL, + path: dir.path().join(&NON_BOOT_3_PATHS.override_json), + result: MupdateOverrideNonBootResult::ReadError( + deserialize_error( + dir.path(), + &NON_BOOT_3_PATHS.override_json, + "", + ), + ), + }, + ] + .into_iter() + .collect(), + ); + + logctx.cleanup_successful(); + } + + fn override_info() -> MupdateOverrideInfo { + MupdateOverrideInfo { + mupdate_uuid: OVERRIDE_UUID, + hash_ids: BTreeSet::new(), + } + } + + fn override_info_2() -> MupdateOverrideInfo { + MupdateOverrideInfo { + mupdate_uuid: OVERRIDE_2_UUID, + hash_ids: BTreeSet::new(), + } + } + + fn dataset_missing_error(dir_path: &Utf8Path) -> MupdateOverrideReadError { + MupdateOverrideReadError::DatasetDirMetadata { + dataset_dir: dir_path.to_owned(), + error: Arc::new(io::Error::from(io::ErrorKind::NotFound)), + } + } + + fn dataset_not_dir_error(dir_path: &Utf8Path) -> MupdateOverrideReadError { + // A `FileType` must unfortunately be retrieved from disk -- can't + // create a new one in-memory. We assume that `dir.path()` passed in + // actually has the described error condition. + MupdateOverrideReadError::DatasetNotDirectory { + dataset_dir: dir_path.to_owned(), + file_type: fs::symlink_metadata(dir_path) + .expect("lstat on dir.path() succeeded") + .file_type(), + } + } + + fn deserialize_error( + dir_path: &Utf8Path, + json_path: &Utf8Path, + contents: &str, + ) -> MupdateOverrideReadError { + MupdateOverrideReadError::Deserialize { + path: dir_path.join(json_path), + contents: contents.to_owned(), + error: Arc::new( + serde_json::from_str::(contents) + .unwrap_err(), + ), + } + } +} diff --git a/sled-agent/zone-images/src/source_resolver.rs b/sled-agent/zone-images/src/source_resolver.rs index cf6d7faff35..96621aca76b 100644 --- a/sled-agent/zone-images/src/source_resolver.rs +++ b/sled-agent/zone-images/src/source_resolver.rs @@ -4,11 +4,15 @@ //! Zone image lookup. +use crate::AllMupdateOverrides; +use camino::Utf8Path; use camino::Utf8PathBuf; +use illumos_utils::zpool::ZpoolName; use nexus_sled_agent_shared::inventory::OmicronZoneImageSource; use sled_storage::dataset::INSTALL_DATASET; use sled_storage::dataset::M2_ARTIFACT_DATASET; use sled_storage::resources::AllDisks; +use slog::o; use std::sync::Arc; use std::sync::Mutex; @@ -26,6 +30,16 @@ pub struct ZoneImageFileSource { pub search_paths: Vec, } +/// A description of zpools to examine for zone images. +pub struct ZoneImageZpools<'a> { + /// The root directory, typically `/`. + pub root: &'a Utf8Path, + + /// The full set of M.2 zpools that are currently known. Must be non-empty, + /// but it can include the boot zpool. + pub all_m2_zpools: Vec, +} + /// Resolves [`OmicronZoneImageSource`] instances into file names and search /// paths. /// @@ -37,9 +51,16 @@ pub struct ZoneImageSourceResolver { } impl ZoneImageSourceResolver { - pub fn new() -> Self { - ZoneImageSourceResolver { - inner: Arc::new(Mutex::new(ResolverInner::new())), + /// Creates a new `ZoneImageSourceResolver`. + pub fn new( + log: &slog::Logger, + zpools: &ZoneImageZpools<'_>, + boot_zpool: &ZpoolName, + ) -> Self { + Self { + inner: Arc::new(Mutex::new(ResolverInner::new( + log, zpools, boot_zpool, + ))), } } @@ -64,12 +85,29 @@ impl ZoneImageSourceResolver { #[derive(Debug)] struct ResolverInner { + #[expect(unused)] + log: slog::Logger, image_directory_override: Option, + // Store all collected information for mupdate overrides -- we're going to + // need to report this via inventory. + // + // This isn't actually used yet. + #[expect(unused)] + mupdate_overrides: AllMupdateOverrides, } impl ResolverInner { - fn new() -> Self { - Self { image_directory_override: None } + fn new( + log: &slog::Logger, + zpools: &ZoneImageZpools<'_>, + boot_zpool: &ZpoolName, + ) -> Self { + let log = log.new(o!("component" => "ZoneImageSourceResolver")); + + let mupdate_overrides = + AllMupdateOverrides::read_all(&log, zpools, boot_zpool); + + Self { log, image_directory_override: None, mupdate_overrides } } fn override_image_directory( @@ -77,7 +115,14 @@ impl ResolverInner { image_directory_override: Utf8PathBuf, ) { if let Some(dir) = &self.image_directory_override { - panic!("image_directory_override already set to {dir}"); + // Allow idempotent sets to the same directory -- some tests do + // this. + if image_directory_override != *dir { + panic!( + "image_directory_override already set to `{dir}`, \ + attempting to set it to `{image_directory_override}`" + ); + } } self.image_directory_override = Some(image_directory_override); }