Skip to content

[nexus] track "intended" instance states #8053

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions dev-tools/omdb/src/bin/omdb/db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ use nexus_db_model::ExternalIp;
use nexus_db_model::HwBaseboardId;
use nexus_db_model::Image;
use nexus_db_model::Instance;
use nexus_db_model::InstanceIntendedState;
use nexus_db_model::InvCollection;
use nexus_db_model::InvNvmeDiskFirmware;
use nexus_db_model::InvPhysicalDisk;
Expand Down Expand Up @@ -4351,6 +4352,7 @@ async fn cmd_db_instance_info(
const BOOT_DISK: &'static str = "boot disk";
const AUTO_RESTART: &'static str = "auto-restart";
const STATE: &'static str = "nexus state";
const INTENDED_STATE: &'static str = "intended state";
const LAST_MODIFIED: &'static str = "last modified at";
const LAST_UPDATED: &'static str = "last updated at";
const LAST_AUTO_RESTART: &'static str = " last reincarnated at";
Expand All @@ -4376,6 +4378,7 @@ async fn cmd_db_instance_info(
AUTO_RESTART,
STATE,
API_STATE,
INTENDED_STATE,
LAST_UPDATED,
LAST_MODIFIED,
LAST_AUTO_RESTART,
Expand Down Expand Up @@ -4435,16 +4438,15 @@ async fn cmd_db_instance_info(
"{} {API_STATE:>WIDTH$}: {effective_state:?}",
if effective_state == InstanceState::Failed { "/!\\" } else { "(i)" }
);
println!(" {INTENDED_STATE:>WIDTH$}: {}", instance.intended_state);
println!(
" {LAST_UPDATED:>WIDTH$}: {time_updated:?} (generation {})",
r#gen.0
);

// Reincarnation status
let InstanceKarmicStatus { needs_reincarnation, can_reincarnate } =
instance
.auto_restart
.status(&instance.runtime_state, active_vmm.as_ref());
instance.auto_restart_status(active_vmm.as_ref());
println!(
"{} {NEEDS_REINCARNATION:>WIDTH$}: {needs_reincarnation}",
if needs_reincarnation { "(i)" } else { " " }
Expand Down Expand Up @@ -4790,6 +4792,7 @@ struct VmmStateRow {
struct CustomerInstanceRow {
id: String,
state: String,
intent: InstanceIntendedState,
propolis_id: MaybePropolisId,
sled_id: MaybeSledId,
host_serial: String,
Expand Down Expand Up @@ -4867,6 +4870,7 @@ async fn cmd_db_instances(
id: i.instance().id().to_string(),
name: i.instance().name().to_string(),
state: i.effective_state().to_string(),
intent: i.instance().intended_state,
propolis_id: (&i).into(),
sled_id: (&i).into(),
host_serial,
Expand Down
2 changes: 1 addition & 1 deletion dev-tools/omdb/tests/successes.out
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ EXECUTING COMMAND: omdb ["db", "instances"]
termination: Exited(0)
---------------------------------------------
stdout:
ID STATE PROPOLIS_ID SLED_ID HOST_SERIAL NAME
ID STATE INTENT PROPOLIS_ID SLED_ID HOST_SERIAL NAME
---------------------------------------------
stderr:
note: using database URL postgresql://root@[::1]:REDACTED_PORT/omicron?sslmode=disable
Expand Down
106 changes: 65 additions & 41 deletions nexus/db-model/src/instance.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::InstanceIntendedState as IntendedState;
use super::{
ByteCount, Disk, ExternalIp, Generation, InstanceAutoRestartPolicy,
InstanceCpuCount, InstanceState, Vmm, VmmState,
Expand Down Expand Up @@ -70,6 +71,15 @@ pub struct Instance {
#[diesel(embed)]
pub runtime_state: InstanceRuntimeState,

/// The most recently requested state of the instance.
///
/// This is used to determine whether the instance should be automatically
/// restarted when it fails. Instances that were requested to stop should
/// not be automatically restarted if they have transitioned to the `Failed`
/// state.
#[diesel(column_name = intended_state)]
pub intended_state: IntendedState,

/// A UUID identifying the saga currently holding the update lock on this
/// instance. If this is [`None`] the instance is not locked. Otherwise, if
/// this is [`Some`], the instance is locked by the saga owning this UUID.
Expand Down Expand Up @@ -112,6 +122,12 @@ impl Instance {
cooldown: None,
};

let intended_state = if params.start {
IntendedState::Running
} else {
IntendedState::Stopped
};

Self {
identity,
project_id,
Expand All @@ -125,6 +141,7 @@ impl Instance {
boot_disk_id: None,

runtime_state,
intended_state,

updater_gen: Generation::new(),
updater_id: None,
Expand All @@ -134,6 +151,47 @@ impl Instance {
pub fn runtime(&self) -> &InstanceRuntimeState {
&self.runtime_state
}

/// Returns an instance's karmic status.
pub fn auto_restart_status(
&self,
active_vmm: Option<&Vmm>,
) -> InstanceKarmicStatus {
let state = &self.runtime_state;
// Instances only need to be automatically restarted if they are in the
// `Failed` state, or if their active VMM is in the `SagaUnwound` state.
//
// If the instance's intended state is not Running, it should
// not be reincarnated, even if it has failed.
let needs_reincarnation =
match (self.intended_state, state.nexus_state, active_vmm) {
(IntendedState::Running, InstanceState::Failed, _vmm) => {
debug_assert!(
_vmm.is_none(),
"a Failed instance will never have an active VMM!"
);
true
}
(IntendedState::Running, InstanceState::Vmm, Some(ref vmm)) => {
debug_assert_eq!(
state.propolis_id,
Some(vmm.id),
"don't call `Instance::auto_restart_status` with a VMM \
that isn't this instance's active VMM!?!?"
);
// Note that we *don't* reincarnate instances with `Failed` active
// VMMs; in that case, an instance-update saga must first run to
// move the *instance* record to the `Failed` state.
vmm.runtime.state == VmmState::SagaUnwound
}
_ => false,
};

InstanceKarmicStatus {
needs_reincarnation,
can_reincarnate: self.auto_restart.can_reincarnate(&state),
}
}
}

impl DatastoreAttachTargetConfig<Disk> for Instance {
Expand Down Expand Up @@ -320,43 +378,6 @@ impl InstanceAutoRestart {
pub const DEFAULT_POLICY: InstanceAutoRestartPolicy =
InstanceAutoRestartPolicy::BestEffort;

/// Returns an instance's karmic status.
pub fn status(
&self,
state: &InstanceRuntimeState,
active_vmm: Option<&Vmm>,
) -> InstanceKarmicStatus {
// Instances only need to be automatically restarted if they are in the
// `Failed` state, or if their active VMM is in the `SagaUnwound` state.
let needs_reincarnation = match (state.nexus_state, active_vmm) {
(InstanceState::Failed, _vmm) => {
debug_assert!(
_vmm.is_none(),
"a Failed instance will never have an active VMM!"
);
true
}
(InstanceState::Vmm, Some(ref vmm)) => {
debug_assert_eq!(
state.propolis_id,
Some(vmm.id),
"don't call `InstanceAutoRestart::status with a VMM \
that isn't this instance's active VMM!?!?"
);
// Note that we *don't* reincarnate instances with `Failed` active
// VMMs; in that case, an instance-update saga must first run to
// move the *instance* record to the `Failed` state.
vmm.runtime.state == VmmState::SagaUnwound
}
_ => false,
};

InstanceKarmicStatus {
needs_reincarnation,
can_reincarnate: self.can_reincarnate(&state),
}
}

/// Returns whether or not this auto-restart configuration will permit an
/// instance with the provided `InstanceRuntimeState` to reincarnate.
///
Expand Down Expand Up @@ -425,10 +446,13 @@ impl InstanceAutoRestart {
// If the auto-restart policy is null, then it should
// default to "best effort".
.or(dsl::auto_restart_policy.is_null()))
// An instance whose last reincarnation was within the cooldown
// interval from now must remain in _bardo_ --- the liminal
// state between death and rebirth --- before its next
// reincarnation.
// An instance should only be automatically restarted if it is intended
// to be in the `Running` state. Instances which were requested to stop
// should not be automatically restarted, even if they have failed.
.and(dsl::intended_state.eq(IntendedState::Running))
// instance whose last reincarnation was within the cooldown interval
// from now must remain in _bardo_ --- the liminal state between death
// and rebirth --- before its next reincarnation.
.and(
// If the instance has never previously been reincarnated, then
// it's allowed to reincarnate.
Expand Down
47 changes: 47 additions & 0 deletions nexus/db-model/src/instance_intended_state.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
// This Source Code Form is subject to the terms of the Mozilla Public
// License, v. 2.0. If a copy of the MPL was not distributed with this
// file, You can obtain one at https://mozilla.org/MPL/2.0/.

use super::impl_enum_type;
use serde::Deserialize;
use serde::Serialize;
use std::fmt;

impl_enum_type!(
InstanceIntendedStateEnum:

#[derive(
Copy,
Clone,
Debug,
PartialEq,
AsExpression,
FromSqlRow,
Serialize,
Deserialize,
)]
pub enum InstanceIntendedState;

// Enum values
Running => b"running"
Stopped => b"stopped"
GuestShutdown => b"guest_shutdown"
Destroyed => b"destroyed"
);

impl InstanceIntendedState {
pub fn as_str(&self) -> &'static str {
match self {
Self::Running => "running",
Self::Stopped => "stopped",
Self::GuestShutdown => "guest_shutdown",
Self::Destroyed => "destroyed",
}
}
}

impl fmt::Display for InstanceIntendedState {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
f.write_str(self.as_str())
}
}
2 changes: 2 additions & 0 deletions nexus/db-model/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ mod image;
mod instance;
mod instance_auto_restart_policy;
mod instance_cpu_count;
mod instance_intended_state;
mod instance_state;
mod internet_gateway;
mod inventory;
Expand Down Expand Up @@ -168,6 +169,7 @@ pub use image::*;
pub use instance::*;
pub use instance_auto_restart_policy::*;
pub use instance_cpu_count::*;
pub use instance_intended_state::*;
pub use instance_state::*;
pub use internet_gateway::*;
pub use inventory::*;
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(139, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(140, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(140, "instance-intended-state"),
KnownVersion::new(139, "webhooks"),
KnownVersion::new(138, "saga-abandoned-state"),
KnownVersion::new(137, "oximeter-read-policy"),
Expand Down
Loading
Loading